testable example docs for training-serialization (#373)
* testable usage docs for training and serialization and reference in README.md
* Generate Readme from testable examples + template
* add up-to-date check for Readme with generated one
* try make pipeline fail by adding something to the lib.rs readme
* remove difference from lib.rs again to make pipeline pass
* fix black version

Co-authored-by: Simon Ertl <simon@Simons-MacBook-Pro.local>
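The Readme check this commit adds to `.github/workflows/rust.yml` (see the diff below) can also be reproduced locally. A minimal sketch, assuming `cargo-readme` is installed from crates.io and the commands are run from the repository root:

```sh
# Install the cargo-readme subcommand (fetched from crates.io, as in the workflow).
cargo install cargo-readme

# Regenerate the Readme from the doc comments in src/lib.rs plus README.tpl,
# then compare it against the committed README.md; any difference is an error.
cd tokenizers
cargo readme > must_match_readme.md
diff must_match_readme.md README.md
```

This mirrors the new workflow step, which runs the same command with `working-directory: ./tokenizers` and fails the job when the generated file and README.md differ.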
.github/workflows/python.yml (2 changed lines)

@@ -25,7 +25,7 @@ jobs:
           architecture: "x64"
 
       - name: Install dependencies
-        run: pip install black
+        run: pip install black==19.10b0
 
       - name: Check style
         working-directory: ./bindings/python
.github/workflows/rust.yml (14 changed lines)

@@ -29,6 +29,13 @@ jobs:
       - if: matrix.os == 'ubuntu-latest'
         run: sudo chown -R $(whoami):$(id -ng) ~/.cargo/
 
+      - name: Install cargo-readme for Ubuntu
+        if: matrix.os == 'ubuntu-latest'
+        uses: actions-rs/cargo@v1
+        with:
+          command: install
+          args: cargo-readme
+
       - name: Build
         uses: actions-rs/cargo@v1
         with:
@@ -73,3 +80,10 @@ jobs:
         with:
           command: test
           args: --verbose --manifest-path ./tokenizers/Cargo.toml --doc
+
+      # Verify that Readme.md is up to date.
+      - name: Make sure, Readme generated from lib.rs matches actual Readme
+        if: matrix.os == 'ubuntu-latest'
+        shell: bash
+        working-directory: ./tokenizers
+        run: cargo readme > must_match_readme.md && diff must_match_readme.md README.md
tokenizers/README.md

@@ -33,65 +33,75 @@ The various steps of the pipeline are:
 4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything relevant
    that, for example, a language model would need, such as special tokens.
 
-## Quick example
-
-Train and serialize a Tokenizer.
-
-```Rust
-use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
-use tokenizers::Result;
-use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence};
-use tokenizers::pre_tokenizers::byte_level::ByteLevel;
-use tokenizers::tokenizer::{AddedToken, Tokenizer, Trainer};
-
-use std::path::Path;
-
-fn main() -> Result<()>{
-    let vocab_size: usize = 100;
-
-    let trainer: Box<dyn Trainer> = Box::new(
-        BpeTrainerBuilder::new()
-            .show_progress(true)
-            .vocab_size(vocab_size)
-            .min_frequency(0)
-            .special_tokens(vec![
-                AddedToken::from("<s>", true),
-                AddedToken::from("<pad>", true),
-                AddedToken::from("</s>", true),
-                AddedToken::from("<unk>", true),
-                AddedToken::from("<mask>", true),
-            ])
-            .build(),
-    );
-
-    let mut tokenizer = Tokenizer::new(Box::new(BPE::default()));
-    tokenizer.with_normalizer(Box::new(Sequence::new(vec![
-        Box::new(Strip::new(true, true)),
-        Box::new(NFC),
-    ])));
-    tokenizer.with_pre_tokenizer(Box::new(ByteLevel::default()));
-
-    tokenizer.train(&trainer, vec!["/path/to/train.txt".to_string()])?;
-    tokenizer.save("/path/to/trained_tokenizer", true)?;
+### Deserialization and tokenization example
+
+```rust
+use tokenizers::tokenizer::{Result, Tokenizer, EncodeInput};
+use tokenizers::models::bpe::BPE;
+
+fn main() -> Result<()> {
+    let bpe_builder = BPE::from_files("./path/to/vocab.json", "./path/to/merges.txt");
+    let bpe = bpe_builder
+        .dropout(0.1)
+        .unk_token("[UNK]".into())
+        .build()?;
+
+    let mut tokenizer = Tokenizer::new(bpe);
+
+    let encoding = tokenizer.encode("Hey there!", false)?;
+    println!("{:?}", encoding.get_tokens());
 
     Ok(())
 }
 ```
 
-Deserialize a pretrained Tokenizer.
-
-```Rust
-use tokenizers::Result;
-use tokenizers::tokenizer::Tokenizer;
-
-fn main() -> Result<()>{
-    let tokenizer = Tokenizer::from_file("/path/to/trained_tokenizer")?;
-
-    let sample_encoding = tokenizer.encode("Huggingface", false)?;
-
-    println!("{:?}", sample_encoding);
+### Training and serialization example
+
+```rust
+use tokenizers::decoders::DecoderWrapper;
+use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
+use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence, NormalizerWrapper};
+use tokenizers::pre_tokenizers::byte_level::ByteLevel;
+use tokenizers::pre_tokenizers::PreTokenizerWrapper;
+use tokenizers::processors::PostProcessorWrapper;
+use tokenizers::{AddedToken, Model, Result, TokenizerBuilder};
+
+use std::path::Path;
+
+fn main() -> Result<()> {
+    let vocab_size: usize = 100;
+
+    let trainer = BpeTrainerBuilder::new()
+        .show_progress(true)
+        .vocab_size(vocab_size)
+        .min_frequency(0)
+        .special_tokens(vec![
+            AddedToken::from(String::from("<s>"), true),
+            AddedToken::from(String::from("<pad>"), true),
+            AddedToken::from(String::from("</s>"), true),
+            AddedToken::from(String::from("<unk>"), true),
+            AddedToken::from(String::from("<mask>"), true),
+        ])
+        .build();
+
+    let tokenizer = TokenizerBuilder::new()
+        .with_model(BPE::default())
+        .with_normalizer(Some(Sequence::new(vec![
+            NormalizerWrapper::StripNormalizer(Strip::new(true, true)),
+            NormalizerWrapper::NFC(NFC),
+        ])))
+        .with_pretokenizer(Some(PreTokenizerWrapper::ByteLevel(ByteLevel::default())))
+        .with_postprocessor(Some(PostProcessorWrapper::ByteLevel(ByteLevel::default())))
+        .with_decoder(Some(DecoderWrapper::ByteLevel(ByteLevel::default())))
+        .build()?;
+
+    tokenizer
+        .train(
+            &trainer,
+            vec!["path/to/vocab.txt".to_string()],
+        )?
+        .get_model()
+        .save(Path::new("result-folder"), Some("some-prefix"))?;
 
     Ok(())
 }
tokenizers/README.tpl (new file, 18 lines)

@@ -0,0 +1,18 @@
+<p align="center">
+    <br>
+    <img src="https://huggingface.co/landing/assets/tokenizers/tokenizers-logo.png" width="600"/>
+    <br>
+<p>
+<p align="center">
+    <img alt="Build" src="https://github.com/huggingface/tokenizers/workflows/Rust/badge.svg">
+    <a href="https://github.com/huggingface/tokenizers/blob/master/LICENSE">
+        <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/tokenizers.svg?color=blue">
+    </a>
+    <a href="https://docs.rs/tokenizers/">
+        <img alt="Doc" src="https://docs.rs/tokenizers/badge.svg">
+    </a>
+</p>
+<br>
+
+
+{{readme}}
tokenizers/src/lib.rs

@@ -2,15 +2,13 @@
 #![doc(html_favicon_url = "https://huggingface.co/favicon.ico")]
 #![doc(html_logo_url = "https://huggingface.co/landing/assets/huggingface_logo.svg")]
 
-//! # Tokenizers
-//!
+//! The core of `tokenizers`, written in Rust.
 //! Provides an implementation of today's most used tokenizers, with a focus on performance and
 //! versatility.
 //!
-//! ## What is a Tokenizer
+//! # What is a Tokenizer
 //!
-//! A Tokenizer works as a pipeline, it processes some raw text as input and outputs an
-//! `Encoding`.
+//! A Tokenizer works as a pipeline, it processes some raw text as input and outputs an `Encoding`.
 //! The various steps of the pipeline are:
 //!
 //! 1. The `Normalizer`: in charge of normalizing the text. Common examples of normalization are
@@ -22,7 +20,7 @@
 //! 4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything relevant
 //!    that, for example, a language model would need, such as special tokens.
 //!
-//! ## Quick example
+//! ## Deserialization and tokenization example
 //!
 //! ```no_run
 //! use tokenizers::tokenizer::{Result, Tokenizer, EncodeInput};
@@ -43,6 +41,65 @@
 //! Ok(())
 //! }
 //! ```
+//!
+//! ## Training and serialization example
+//!
+//! ```no_run
+//! use tokenizers::decoders::DecoderWrapper;
+//! use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
+//! use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence, NormalizerWrapper};
+//! use tokenizers::pre_tokenizers::byte_level::ByteLevel;
+//! use tokenizers::pre_tokenizers::PreTokenizerWrapper;
+//! use tokenizers::processors::PostProcessorWrapper;
+//! use tokenizers::{AddedToken, Model, Result, TokenizerBuilder};
+//!
+//! use std::path::Path;
+//!
+//! fn main() -> Result<()> {
+//!     let vocab_size: usize = 100;
+//!
+//!     let trainer = BpeTrainerBuilder::new()
+//!         .show_progress(true)
+//!         .vocab_size(vocab_size)
+//!         .min_frequency(0)
+//!         .special_tokens(vec![
+//!             AddedToken::from(String::from("<s>"), true),
+//!             AddedToken::from(String::from("<pad>"), true),
+//!             AddedToken::from(String::from("</s>"), true),
+//!             AddedToken::from(String::from("<unk>"), true),
+//!             AddedToken::from(String::from("<mask>"), true),
+//!         ])
+//!         .build();
+//!
+//!     let tokenizer = TokenizerBuilder::new()
+//!         .with_model(BPE::default())
+//!         .with_normalizer(Some(Sequence::new(vec![
+//!             NormalizerWrapper::StripNormalizer(Strip::new(true, true)),
+//!             NormalizerWrapper::NFC(NFC),
+//!         ])))
+//!         .with_pretokenizer(Some(PreTokenizerWrapper::ByteLevel(ByteLevel::default())))
+//!         .with_postprocessor(Some(PostProcessorWrapper::ByteLevel(ByteLevel::default())))
+//!         .with_decoder(Some(DecoderWrapper::ByteLevel(ByteLevel::default())))
+//!         .build()?;
+//!
+//!     tokenizer
+//!         .train(
+//!             &trainer,
+//!             vec!["path/to/vocab.txt".to_string()],
+//!         )?
+//!         .get_model()
+//!         .save(Path::new("result-folder"), Some("some-prefix"))?;
+//!
+//!     Ok(())
+//! }
+//! ```
+//!
+//! # Additional information
+//!
+//! - tokenizers is designed to leverage CPU parallelism when possible. The level of parallelism is determined
+//! by the total number of core/threads your CPU provides but this can be tuned by setting the `RAYON_RS_NUM_CPUS`
+//! environment variable. As an example setting `RAYON_RS_NUM_CPUS=4` will allocate a maximum of 4 threads.
+//! **_Please note this behavior may evolve in the future_**
 
 #[macro_use]
 extern crate log;