Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-03 11:18:29 +00:00)
Attempt to get some documentation going.
commit 655809c718 (parent 4929809af0)
committed by Anthony MOI
README.md (23 lines changed)
@@ -41,13 +41,10 @@ Start using in a matter of seconds:
 ```python
 # Tokenizers provides ultra-fast implementations of most current tokenizers:
->>> from tokenizers import (ByteLevelBPETokenizer,
-                            CharBPETokenizer,
-                            SentencePieceBPETokenizer,
-                            BertWordPieceTokenizer)
+>>> from tokenizers import Tokenizer
 # Ultra-fast => they can encode 1GB of text in ~20sec on a standard server's CPU
 # Tokenizers can be easily instantiated from standard files
->>> tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
+>>> tokenizer = Tokenizer.from_file("bert-base-uncased-vocab.json")
 Tokenizer(vocabulary_size=30522, model=BertWordPiece, add_special_tokens=True, unk_token=[UNK],
           sep_token=[SEP], cls_token=[CLS], clean_text=True, handle_chinese_chars=True,
           strip_accents=True, lowercase=True, wordpieces_prefix=##)
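For context on the `Tokenizer.from_file` call introduced above: the encode and decode calls it pairs with are exercised by the Python test added later in this commit. A minimal sketch, assuming a serialized tokenizer JSON such as the `data/roberta.json` file the new tests download:

```python
from tokenizers import Tokenizer

# Load a serialized tokenizer (the path is an assumption; the tests in this
# commit use data/roberta.json, fetched by the new Makefile targets).
tokenizer = Tokenizer.from_file("data/roberta.json")

# encode() returns an Encoding holding both ids and token strings.
encoding = tokenizer.encode("This is an example")
print(encoding.ids)     # [713, 16, 41, 1246] for the RoBERTa tokenizer
print(encoding.tokens)  # ["This", "Ġis", "Ġan", "Ġexample"]

# decode() maps ids back to text.
print(tokenizer.decode(encoding.ids))  # "This is an example"
```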
@@ -70,12 +67,26 @@ And training a new vocabulary is just as easy:
 ```python
 # You can also train a BPE/Byte-levelBPE/WordPiece vocabulary on your own files
->>> tokenizer = ByteLevelBPETokenizer()
+>>> unk_token = "[UNK]"
+>>> replacement = "▁"
+>>> add_prefix_space = True
+
+>>> tokenizer = Tokenizer(BPE(unk_token=unk_token))
+>>> tokenizer.normalizer = NFKC()
+>>> tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
+        replacement=replacement, add_prefix_space=add_prefix_space
+    )
+>>> tokenizer.decoder = decoders.Metaspace(
+        replacement=replacement, add_prefix_space=add_prefix_space
+    )
+>>> tokenizer.add_special_tokens([unk_token])
 >>> tokenizer.train(["wiki.test.raw"], vocab_size=20000)
 [00:00:00] Tokenize words ████████████████████████████████████████ 20993/20993
 [00:00:00] Count pairs    ████████████████████████████████████████ 20993/20993
 [00:00:03] Compute merges ████████████████████████████████████████ 19375/19375
 ```
 
+You can check the guide [to build your own tokenizer](...)
 
 ## Contributors
 
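The new README snippet above leaves its imports implicit. A minimal sketch of the names it relies on; the module paths are an assumption based on the Python package layout used by the tests in this commit:

```python
from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC

# Mirror the README snippet: a BPE model with NFKC normalization and
# Metaspace pre-tokenization/decoding.
unk_token = "[UNK]"
tokenizer = Tokenizer(BPE(unk_token=unk_token))
tokenizer.normalizer = NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True)
tokenizer.decoder = decoders.Metaspace(replacement="▁", add_prefix_space=True)
tokenizer.add_special_tokens([unk_token])

# tokenizer.train(["wiki.test.raw"], vocab_size=20000)  # as in the README; needs the data file
```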
bindings/node/Makefile (new file, 30 lines)
@@ -0,0 +1,30 @@
+.PHONY: style check-style test
+
+DATA_DIR = data
+
+dir_guard=@mkdir -p $(@D)
+
+# Format source code automatically
+style:
+	npm run lint
+
+# Check the source code is formatted correctly
+check-style:
+	npm run lint-check
+
+TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
+
+# Launch the test suite
+test: $(TESTS_RESOURCES)
+	npm run test
+
+$(DATA_DIR)/big.txt :
+	$(dir_guard)
+	wget https://norvig.com/big.txt -O $@
+
+$(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
+	head -100 $(DATA_DIR)/big.txt > $@
+
+$(DATA_DIR)/roberta.json :
+	$(dir_guard)
+	wget https://storage.googleapis.com/tokenizers/roberta.json -O $@
bindings/node/examples/load.test.ts (new file, 20 lines)
@@ -0,0 +1,20 @@
+/* eslint-disable @typescript-eslint/no-explicit-any */
+/* eslint-disable @typescript-eslint/no-empty-function */
+
+const tokenizers = await import("tokenizers");
+
+describe("loadExample", () => {
+  it("", () => {
+    const example = "This is an example";
+    const ids = [713, 16, 41, 1246];
+    const tokens = ["This", "Ġis", "Ġan", "Ġexample"];
+
+    const tokenizer = tokenizers.Tokenizer.fromFile("data/roberta.json");
+    const encoded = tokenizer.encode(example);
+
+    expect(encoded.ids).toBe(ids);
+    expect(encoded.tokens).toBe(tokens);
+
+    expect(tokenizer.decode(ids)).toBe(example);
+  });
+});
bindings/node/examples/train.test.ts (new file, 46 lines)
@@ -0,0 +1,46 @@
+/* eslint-disable @typescript-eslint/no-explicit-any */
+/* eslint-disable @typescript-eslint/no-empty-function */
+
+const {
+  Tokenizer,
+  models,
+  normalizers,
+  pre_tokenizers,
+  post_processors,
+  decoders,
+  trainers,
+  AddedToken,
+} = await import("tokenizers");
+
+describe("trainExample", () => {
+  it("", () => {
+    const vocab_size = 100;
+
+    const tokenizer = new Tokenizer(models.BPE.empty());
+    tokenizer.normalizer = normalizers.sequenceNormalizer([
+      normalizers.stripNormalizer(),
+      normalizers.nfcNormalizer(),
+    ]);
+    tokenizer.pre_tokenizer = pre_tokenizers.byteLevelPreTokenizer();
+    tokenizer.post_processor = post_processors.byteLevelProcessing();
+    tokenizer.decoder = decoders.byteLevelDecoder();
+
+    const trainer = trainers.bpeTrainer({
+      vocab_size,
+      min_frequency: 0,
+      special_tokens: [
+        new AddedToken("<s>", true),
+        new AddedToken("<pad>", true),
+        new AddedToken("</s>", true),
+        new AddedToken("<unk>", true),
+        new AddedToken("<mask>", true),
+      ],
+      show_progress: true,
+    });
+
+    tokenizer.train(trainer, ["data/small.txt"]);
+    tokenizer.save("data/tokenizer.json");
+
+    expect(1).toBe(1);
+  });
+});
@@ -10,7 +10,14 @@ export {
   InputSequence,
   EncodeInput,
   EncodeOptions,
+  Tokenizer,
   TruncationConfiguration,
   TruncationOptions,
 } from "./bindings/tokenizer";
+export * as models from "./bindings/models";
+export * as normalizers from "./bindings/normalizers";
+export * as pre_tokenizers from "./bindings/pre-tokenizers";
+export * as decoders from "./bindings/decoders";
+export * as post_processors from "./bindings/post-processors";
+export * as trainers from "./bindings/trainers";
 export { Encoding } from "./implementations/encoding";
@@ -966,8 +966,8 @@ pub fn tokenizer_from_string(mut cx: FunctionContext) -> JsResult<JsTokenizer> {
 pub fn tokenizer_from_file(mut cx: FunctionContext) -> JsResult<JsTokenizer> {
     let s = cx.extract::<String>(0)?;
 
-    let tokenizer =
-        tk::tokenizer::TokenizerImpl::from_file(s).map_err(|e| Error(format!("{}", e)))?;
+    let tokenizer = tk::tokenizer::TokenizerImpl::from_file(s)
+        .map_err(|e| Error(format!("Error loading from file{}", e)))?;
 
     let mut js_tokenizer = JsTokenizer::new::<_, JsTokenizer, _>(&mut cx, vec![])?;
     let guard = cx.lock();
@@ -1,5 +1,9 @@
 .PHONY: style check-style test
 
+DATA_DIR = data
+
+dir_guard=@mkdir -p $(@D)
+
 # Format source code automatically
 style:
 	black --line-length 100 --target-version py35 examples py_src/tokenizers tests
@@ -8,7 +12,20 @@ style:
 check-style:
 	black --check --line-length 100 --target-version py35 examples py_src/tokenizers tests
 
+TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
+
 # Launch the test suite
-test:
+test: $(TESTS_RESOURCES)
 	python -m pytest -s -v tests
 	cargo test --no-default-features
+
+$(DATA_DIR)/big.txt :
+	$(dir_guard)
+	wget https://norvig.com/big.txt -O $@
+
+$(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
+	head -100 $(DATA_DIR)/big.txt > $@
+
+$(DATA_DIR)/roberta.json :
+	$(dir_guard)
+	wget https://storage.googleapis.com/tokenizers/roberta.json -O $@
bindings/python/tests/examples/__init__.py (new file, empty)
bindings/python/tests/examples/test_load.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+from tokenizers import Tokenizer
+
+
+tokenizer = Tokenizer.from_file("data/roberta.json")
+
+example = "This is an example"
+ids = [713, 16, 41, 1246]
+tokens = ["This", "Ġis", "Ġan", "Ġexample"]
+
+encodings = tokenizer.encode(example)
+
+assert encodings.ids == ids
+assert encodings.tokens == tokens
+
+decoded = tokenizer.decode(ids)
+assert decoded == example
bindings/python/tests/examples/test_train.py (new file, 40 lines)
@@ -0,0 +1,40 @@
+from tokenizers import (
+    Tokenizer,
+    normalizers,
+    pre_tokenizers,
+    models,
+    decoders,
+    processors,
+    trainers,
+    AddedToken,
+)
+
+
+vocab_size = 100
+
+tokenizer = Tokenizer(models.BPE())
+tokenizer.normalizer = normalizers.Sequence(
+    [
+        normalizers.Strip(),
+        normalizers.NFC(),
+    ]
+)
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
+tokenizer.post_processor = processors.ByteLevel()
+tokenizer.decoder = decoders.ByteLevel()
+
+trainer = trainers.BpeTrainer(
+    vocab_size=vocab_size,
+    min_frequency=0,
+    special_tokens=[
+        AddedToken("<s>"),
+        AddedToken("<pad>"),
+        AddedToken("</s>"),
+        AddedToken("<unk>"),
+        AddedToken("<mask>"),
+    ],
+    show_progress=False,
+)
+
+tokenizer.train(trainer, ["data/small.txt"])
+tokenizer.save("data/tokenizer.json")
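test_train.py above ends by serializing the trained tokenizer to data/tokenizer.json. A minimal sketch of the round trip, reusing the `from_file` entry point from test_load.py (the encode call is illustrative, not taken from the commit):

```python
from tokenizers import Tokenizer

# Reload the tokenizer saved by the training example above
# ("data/tokenizer.json" is the path used in test_train.py).
tokenizer = Tokenizer.from_file("data/tokenizer.json")

# The freshly trained byte-level BPE model can then be used directly.
encoding = tokenizer.encode("This is an example")
print(encoding.tokens)
```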
@@ -9,6 +9,8 @@ class TestByteLevelBPE:
         tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
         output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
 
+        tokenizer.save("roberta.json")
+
         assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
         assert output.tokens == [
             "The",
tokenizers/examples/load.rs (new file, 17 lines)
@@ -0,0 +1,17 @@
+use tokenizers::Tokenizer;
+
+fn main() {
+    let tokenizer: Tokenizer = Tokenizer::from_file("data/roberta.json").unwrap();
+
+    let example = "This is an example";
+    let ids = vec![713, 16, 41, 1246];
+    let tokens = vec!["This", "Ġis", "Ġan", "Ġexample"];
+
+    let encodings = tokenizer.encode(example, false).unwrap();
+
+    assert_eq!(encodings.get_ids(), ids);
+    assert_eq!(encodings.get_tokens(), tokens);
+
+    let decoded = tokenizer.decode(ids, false).unwrap();
+    assert_eq!(decoded, example);
+}
tokenizers/examples/train.rs (new file, 39 lines)
@@ -0,0 +1,39 @@
+use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
+use tokenizers::normalizers::{Sequence, Strip, NFC};
+use tokenizers::pre_tokenizers::byte_level::ByteLevel;
+use tokenizers::{AddedToken, TokenizerBuilder};
+
+fn main() {
+    let vocab_size: usize = 100;
+    let tokenizer = TokenizerBuilder::new()
+        .with_model(BPE::default())
+        .with_normalizer(Some(Sequence::new(vec![
+            Strip::new(true, true).into(),
+            NFC.into(),
+        ])))
+        .with_pre_tokenizer(Some(ByteLevel::default()))
+        .with_post_processor(Some(ByteLevel::default()))
+        .with_decoder(Some(ByteLevel::default()))
+        .build()
+        .unwrap();
+
+    let trainer = BpeTrainerBuilder::new()
+        .show_progress(false)
+        .vocab_size(vocab_size)
+        .min_frequency(0)
+        .special_tokens(vec![
+            AddedToken::from(String::from("<s>"), true),
+            AddedToken::from(String::from("<pad>"), true),
+            AddedToken::from(String::from("</s>"), true),
+            AddedToken::from(String::from("<unk>"), true),
+            AddedToken::from(String::from("<mask>"), true),
+        ])
+        .build();
+
+    let pretty = true;
+    tokenizer
+        .train(&trainer, vec!["data/small.txt".to_string()])
+        .unwrap()
+        .save("data/tokenizer.json", pretty)
+        .unwrap();
+}
@@ -74,21 +74,21 @@
 //!     let tokenizer = TokenizerBuilder::new()
 //!         .with_model(BPE::default())
 //!         .with_normalizer(Some(Sequence::new(vec![
-//!             NormalizerWrapper::StripNormalizer(Strip::new(true, true)),
-//!             NormalizerWrapper::NFC(NFC),
+//!             Strip::new(true, true).into(),
+//!             NFC.into(),
 //!         ])))
-//!         .with_pre_tokenizer(Some(PreTokenizerWrapper::ByteLevel(ByteLevel::default())))
-//!         .with_post_processor(Some(PostProcessorWrapper::ByteLevel(ByteLevel::default())))
-//!         .with_decoder(Some(DecoderWrapper::ByteLevel(ByteLevel::default())))
+//!         .with_pre_tokenizer(Some(ByteLevel::default()))
+//!         .with_post_processor(Some(ByteLevel::default()))
+//!         .with_decoder(Some(ByteLevel::default()))
 //!         .build()?;
 //!
+//!     let pretty = false;
 //!     tokenizer
 //!         .train(
 //!             &trainer,
 //!             vec!["path/to/vocab.txt".to_string()],
 //!         )?
-//!         .get_model()
-//!         .save(Path::new("result-folder"), Some("some-prefix"))?;
+//!         .save("tokenizer.json", pretty)?;
 //!
 //!     Ok(())
 //! }
@@ -383,12 +383,21 @@ impl Tokenizer {
     > {
         self.0
     }
 
     pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self> {
         let content = read_to_string(file)?;
         Ok(serde_json::from_str(&content)?)
     }
 }
 
+impl std::str::FromStr for Tokenizer {
+    type Err = Box<dyn std::error::Error + Send + Sync>;
+
+    fn from_str(s: &str) -> Result<Self> {
+        Ok(serde_json::from_str(s)?)
+    }
+}
+
 impl<M, N, PT, PP, D> From<TokenizerImpl<M, N, PT, PP, D>> for Tokenizer
 where
     M: Into<ModelWrapper>,