Attempt to get some documentation going.

Author: Nicolas Patry
Date: 2020-09-25 21:43:02 +02:00
Committed by: Anthony MOI
Parent: 4929809af0
Commit: 655809c718
15 changed files with 270 additions and 16 deletions


@@ -41,13 +41,10 @@ Start using in a matter of seconds:
```python
# Tokenizers provides ultra-fast implementations of most current tokenizers:
>>> from tokenizers import (ByteLevelBPETokenizer,
CharBPETokenizer,
SentencePieceBPETokenizer,
BertWordPieceTokenizer)
>>> from tokenizers import Tokenizer
# Ultra-fast => they can encode 1GB of text in ~20sec on a standard server's CPU
# Tokenizers can be easily instantiated from standard files
>>> tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
>>> tokenizer = Tokenizer.from_file("bert-base-uncased-vocab.json")
Tokenizer(vocabulary_size=30522, model=BertWordPiece, add_special_tokens=True, unk_token=[UNK],
sep_token=[SEP], cls_token=[CLS], clean_text=True, handle_chinese_chars=True,
strip_accents=True, lowercase=True, wordpieces_prefix=##)
@@ -70,12 +67,26 @@ And training a new vocabulary is just as easy:
```python
# You can also train a BPE/Byte-level BPE/WordPiece vocabulary on your own files
>>> tokenizer = ByteLevelBPETokenizer()
>>> unk_token = "[UNK]"
>>> replacement = "▁"
>>> add_prefix_space = True
>>> tokenizer = Tokenizer(BPE(unk_token=unk_token))
>>> tokenizer.normalizer = NFKC()
>>> tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
)
>>> tokenizer.decoder = decoders.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
)
>>> tokenizer.add_special_tokens([unk_token])
>>> tokenizer.train(["wiki.test.raw"], vocab_size=20000)
[00:00:00] Tokenize words ████████████████████████████████████████ 20993/20993
[00:00:00] Count pairs ████████████████████████████████████████ 20993/20993
[00:00:03] Compute merges ████████████████████████████████████████ 19375/19375
```
You can check the guide [to build your own tokenizer](...)
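
For context (not part of this diff), here is a minimal sketch of what using a tokenizer trained with the new `Tokenizer` API shown above could look like; the `my-tokenizer.json` path is hypothetical and assumes the trained tokenizer was first written out with `tokenizer.save(...)`:
```python
from tokenizers import Tokenizer

# Reload a previously saved tokenizer (hypothetical path, adjust to your setup)
tokenizer = Tokenizer.from_file("my-tokenizer.json")

# Encode a sentence and inspect the produced tokens and ids
output = tokenizer.encode("Training a tokenizer is easy")
print(output.tokens)
print(output.ids)

# Round-trip the ids back to text
print(tokenizer.decode(output.ids))
```
This mirrors the standalone Python loading example added further down in this commit.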
## Contributors

bindings/node/Makefile (new file, +30 lines)

@@ -0,0 +1,30 @@
.PHONY: style check-style test
DATA_DIR = data
dir_guard=@mkdir -p $(@D)
# Format source code automatically
style:
npm run lint
# Check the source code is formatted correctly
check-style:
npm run lint-check
TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
# Launch the test suite
test: $(TESTS_RESOURCES)
npm run test
$(DATA_DIR)/big.txt :
$(dir_guard)
wget https://norvig.com/big.txt -O $@
$(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
head -100 $(DATA_DIR)/big.txt > $@
$(DATA_DIR)/roberta.json :
$(dir_guard)
wget https://storage.googleapis.com/tokenizers/roberta.json -O $@


@@ -0,0 +1,20 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
/* eslint-disable @typescript-eslint/no-empty-function */
const tokenizers = await import("tokenizers");
describe("loadExample", () => {
it("", () => {
const example = "This is an example";
const ids = [713, 16, 41, 1246];
const tokens = ["This", "Ġis", "Ġan", "Ġexample"];
const tokenizer = tokenizers.Tokenizer.fromFile("data/roberta.json");
const encoded = tokenizer.encode(example);
expect(encoded.ids).toEqual(ids);
expect(encoded.tokens).toEqual(tokens);
expect(tokenizer.decode(ids)).toBe(example);
});
});


@@ -0,0 +1,46 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
/* eslint-disable @typescript-eslint/no-empty-function */
const {
Tokenizer,
models,
normalizers,
pre_tokenizers,
post_processors,
decoders,
trainers,
AddedToken,
} = await import("tokenizers");
describe("trainExample", () => {
it("", () => {
const vocab_size = 100;
const tokenizer = new Tokenizer(models.BPE.empty());
tokenizer.normalizer = normalizers.sequenceNormalizer([
normalizers.stripNormalizer(),
normalizers.nfcNormalizer(),
]);
tokenizer.pre_tokenizer = pre_tokenizers.byteLevelPreTokenizer();
tokenizer.post_processor = post_processors.byteLevelProcessing();
tokenizer.decoder = decoders.byteLevelDecoder();
const trainer = trainers.bpeTrainer({
vocab_size,
min_frequency: 0,
special_tokens: [
new AddedToken("<s>", true),
new AddedToken("<pad>", true),
new AddedToken("</s>", true),
new AddedToken("<unk>", true),
new AddedToken("<mask>", true),
],
show_progress: true,
});
tokenizer.train(trainer, ["data/small.txt"]);
tokenizer.save("data/tokenizer.json");
expect(1).toBe(1);
});
});


@@ -10,7 +10,14 @@ export {
InputSequence,
EncodeInput,
EncodeOptions,
Tokenizer,
TruncationConfiguration,
TruncationOptions,
} from "./bindings/tokenizer";
export * as models from "./bindings/models";
export * as normalizers from "./bindings/normalizers";
export * as pre_tokenizers from "./bindings/pre-tokenizers";
export * as decoders from "./bindings/decoders";
export * as post_processors from "./bindings/post-processors";
export * as trainers from "./bindings/trainers";
export { Encoding } from "./implementations/encoding";


@@ -966,8 +966,8 @@ pub fn tokenizer_from_string(mut cx: FunctionContext) -> JsResult<JsTokenizer> {
pub fn tokenizer_from_file(mut cx: FunctionContext) -> JsResult<JsTokenizer> {
let s = cx.extract::<String>(0)?;
let tokenizer =
tk::tokenizer::TokenizerImpl::from_file(s).map_err(|e| Error(format!("{}", e)))?;
let tokenizer = tk::tokenizer::TokenizerImpl::from_file(s)
.map_err(|e| Error(format!("Error loading from file: {}", e)))?;
let mut js_tokenizer = JsTokenizer::new::<_, JsTokenizer, _>(&mut cx, vec![])?;
let guard = cx.lock();


@@ -1,5 +1,9 @@
.PHONY: style check-style test
DATA_DIR = data
dir_guard=@mkdir -p $(@D)
# Format source code automatically
style:
black --line-length 100 --target-version py35 examples py_src/tokenizers tests
@@ -8,7 +12,20 @@ style:
check-style:
black --check --line-length 100 --target-version py35 examples py_src/tokenizers tests
TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
# Launch the test suite
test:
test: $(TESTS_RESOURCES)
python -m pytest -s -v tests
cargo test --no-default-features
$(DATA_DIR)/big.txt :
$(dir_guard)
wget https://norvig.com/big.txt -O $@
$(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
head -100 $(DATA_DIR)/big.txt > $@
$(DATA_DIR)/roberta.json :
$(dir_guard)
wget https://storage.googleapis.com/tokenizers/roberta.json -O $@


@@ -0,0 +1,16 @@
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("data/roberta.json")
example = "This is an example"
ids = [713, 16, 41, 1246]
tokens = ["This", "Ġis", "Ġan", "Ġexample"]
encodings = tokenizer.encode(example)
assert encodings.ids == ids
assert encodings.tokens == tokens
decoded = tokenizer.decode(ids)
assert decoded == example


@@ -0,0 +1,40 @@
from tokenizers import (
Tokenizer,
normalizers,
pre_tokenizers,
models,
decoders,
processors,
trainers,
AddedToken,
)
vocab_size = 100
tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.Sequence(
[
normalizers.Strip(),
normalizers.NFC(),
]
)
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.post_processor = processors.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
trainer = trainers.BpeTrainer(
vocab_size=vocab_size,
min_frequency=0,
special_tokens=[
AddedToken("<s>"),
AddedToken("<pad>"),
AddedToken("</s>"),
AddedToken("<unk>"),
AddedToken("<mask>"),
],
show_progress=False,
)
tokenizer.train(trainer, ["data/small.txt"])
tokenizer.save("data/tokenizer.json")


@@ -9,6 +9,8 @@ class TestByteLevelBPE:
tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
tokenizer.save("roberta.json")
assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
assert output.tokens == [
"The",


@@ -0,0 +1,17 @@
use tokenizers::Tokenizer;
fn main() {
let tokenizer: Tokenizer = Tokenizer::from_file("data/roberta.json").unwrap();
let example = "This is an example";
let ids = vec![713, 16, 41, 1246];
let tokens = vec!["This", "Ġis", "Ġan", "Ġexample"];
let encodings = tokenizer.encode(example, false).unwrap();
assert_eq!(encodings.get_ids(), ids);
assert_eq!(encodings.get_tokens(), tokens);
let decoded = tokenizer.decode(ids, false).unwrap();
assert_eq!(decoded, example);
}


@@ -0,0 +1,39 @@
use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
use tokenizers::normalizers::{Sequence, Strip, NFC};
use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::{AddedToken, TokenizerBuilder};
fn main() {
let vocab_size: usize = 100;
let tokenizer = TokenizerBuilder::new()
.with_model(BPE::default())
.with_normalizer(Some(Sequence::new(vec![
Strip::new(true, true).into(),
NFC.into(),
])))
.with_pre_tokenizer(Some(ByteLevel::default()))
.with_post_processor(Some(ByteLevel::default()))
.with_decoder(Some(ByteLevel::default()))
.build()
.unwrap();
let trainer = BpeTrainerBuilder::new()
.show_progress(false)
.vocab_size(vocab_size)
.min_frequency(0)
.special_tokens(vec![
AddedToken::from(String::from("<s>"), true),
AddedToken::from(String::from("<pad>"), true),
AddedToken::from(String::from("</s>"), true),
AddedToken::from(String::from("<unk>"), true),
AddedToken::from(String::from("<mask>"), true),
])
.build();
let pretty = true;
tokenizer
.train(&trainer, vec!["data/small.txt".to_string()])
.unwrap()
.save("data/tokenizer.json", pretty)
.unwrap();
}


@@ -74,21 +74,21 @@
//! let tokenizer = TokenizerBuilder::new()
//! .with_model(BPE::default())
//! .with_normalizer(Some(Sequence::new(vec![
//! NormalizerWrapper::StripNormalizer(Strip::new(true, true)),
//! NormalizerWrapper::NFC(NFC),
//! Strip::new(true, true).into(),
//! NFC.into(),
//! ])))
//! .with_pre_tokenizer(Some(PreTokenizerWrapper::ByteLevel(ByteLevel::default())))
//! .with_post_processor(Some(PostProcessorWrapper::ByteLevel(ByteLevel::default())))
//! .with_decoder(Some(DecoderWrapper::ByteLevel(ByteLevel::default())))
//! .with_pre_tokenizer(Some(ByteLevel::default()))
//! .with_post_processor(Some(ByteLevel::default()))
//! .with_decoder(Some(ByteLevel::default()))
//! .build()?;
//!
//! let pretty = false;
//! tokenizer
//! .train(
//! &trainer,
//! vec!["path/to/vocab.txt".to_string()],
//! )?
//! .get_model()
//! .save(Path::new("result-folder"), Some("some-prefix"))?;
//! .save("tokenizer.json", pretty)?;
//!
//! Ok(())
//! }


@@ -383,12 +383,21 @@ impl Tokenizer {
> {
self.0
}
pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self> {
let content = read_to_string(file)?;
Ok(serde_json::from_str(&content)?)
}
}
impl std::str::FromStr for Tokenizer {
type Err = Box<dyn std::error::Error + Send + Sync>;
fn from_str(s: &str) -> Result<Self> {
Ok(serde_json::from_str(s)?)
}
}
impl<M, N, PT, PP, D> From<TokenizerImpl<M, N, PT, PP, D>> for Tokenizer
where
M: Into<ModelWrapper>,