Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 00:35:35 +00:00)

Merge branch 'master' into BPE-tests
bindings/python/Cargo.lock (generated, 6 changed lines)
@@ -448,15 +448,15 @@ dependencies = [

 [[package]]
 name = "tokenizers"
-version = "0.0.7"
+version = "0.0.8"
 dependencies = [
  "pyo3 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
- "tokenizers-lib 0.0.7",
+ "tokenizers-lib 0.0.8",
 ]

 [[package]]
 name = "tokenizers-lib"
-version = "0.0.7"
+version = "0.0.8"
 dependencies = [
  "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers"
-version = "0.0.7"
+version = "0.0.8"
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 edition = "2018"

@@ -61,7 +61,7 @@ bpe = models.BPE.from_files(vocab, merges)
 tokenizer = Tokenizer(bpe)

 # Customize pre-tokenization and decoding
-tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new())
+tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(True))
 tokenizer.with_decoder(decoders.ByteLevel.new())

 # And then encode:
@@ -85,7 +85,7 @@ from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
 tokenizer = Tokenizer(models.BPE.empty())

 # Customize pre-tokenization and decoding
-tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new())
+tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(True))
 tokenizer.with_decoder(decoders.ByteLevel.new())

 # And then train
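Both README snippets now pass a boolean to pre_tokenizers.ByteLevel.new. The diff itself does not say what the flag controls; the standalone Rust sketch below assumes it toggles whether a prefix space is prepended before byte-level pre-tokenization (the name add_prefix_space is an assumption, and ByteLevelLike is not the library's ByteLevel type).

// Standalone sketch, not the library's implementation. Assumes the boolean in
// ByteLevel.new(True) controls a prefix space; the field name is a guess.
struct ByteLevelLike {
    add_prefix_space: bool,
}

impl ByteLevelLike {
    fn new(add_prefix_space: bool) -> Self {
        ByteLevelLike { add_prefix_space }
    }

    // Prepend a space so a word at the start of a sentence gets the same
    // byte-level representation as the same word mid-sentence.
    fn normalize(&self, s: &str) -> String {
        if self.add_prefix_space && !s.starts_with(' ') {
            format!(" {}", s)
        } else {
            s.to_owned()
        }
    }
}

fn main() {
    let pre = ByteLevelLike::new(true);
    assert_eq!(pre.normalize("Hello world"), " Hello world");
}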
@@ -3,7 +3,7 @@ from setuptools_rust import Binding, RustExtension

 setup(
     name="tokenizers",
-    version="0.0.7",
+    version="0.0.8",
     description="Fast and Customizable Tokenizers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
@@ -36,9 +36,8 @@ impl Tokenizer {
         }
     }

-    #[getter]
-    fn get_vocab_size(&self) -> usize {
-        self.tokenizer.get_vocab_size()
+    fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
+        self.tokenizer.get_vocab_size(with_added_tokens)
     }

     fn with_model(&mut self, model: &mut Model) -> PyResult<()> {
@@ -192,12 +191,16 @@ impl Tokenizer {
             .into()
     }

-    fn decode(&self, ids: Vec<u32>) -> PyResult<String> {
-        ToPyResult(self.tokenizer.decode(ids)).into()
+    fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> PyResult<String> {
+        ToPyResult(self.tokenizer.decode(ids, skip_special_tokens)).into()
     }

-    fn decode_batch(&self, sentences: Vec<Vec<u32>>) -> PyResult<Vec<String>> {
-        ToPyResult(self.tokenizer.decode_batch(sentences)).into()
+    fn decode_batch(
+        &self,
+        sentences: Vec<Vec<u32>>,
+        skip_special_tokens: bool,
+    ) -> PyResult<Vec<String>> {
+        ToPyResult(self.tokenizer.decode_batch(sentences, skip_special_tokens)).into()
     }

     fn token_to_id(&self, token: &str) -> Option<u32> {
@@ -233,6 +236,10 @@ impl Tokenizer {
         Ok(self.tokenizer.add_tokens(&tokens))
     }

+    fn add_special_tokens(&mut self, tokens: Vec<&str>) -> PyResult<usize> {
+        Ok(self.tokenizer.add_special_tokens(&tokens))
+    }
+
     fn train(&mut self, trainer: &Trainer, files: Vec<String>) -> PyResult<()> {
         trainer.trainer.execute(|trainer| {
             if let Err(e) = self.tokenizer.train(trainer, files) {
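On the Python side these binding changes turn the new flags into extra required arguments (decode(ids, skip_special_tokens), decode_batch(sentences, skip_special_tokens), get_vocab_size(with_added_tokens)), and get_vocab_size loses its #[getter], so it is now called as a method rather than read as a property. The sketch below is a generic pyo3 illustration, not this project's code and written against a recent pyo3 rather than the 0.8 series pinned in this lock file; the Lookup struct and join method are invented to show how adding a bool to a #[pymethods] signature changes the Python call site.

use pyo3::prelude::*;

#[pyclass]
struct Lookup {
    names: Vec<String>,
}

#[pymethods]
impl Lookup {
    #[new]
    fn new(names: Vec<String>) -> Self {
        Lookup { names }
    }

    // Adding `keep_empty: bool` here changes the Python call site from
    // `lookup.join(sep)` to `lookup.join(sep, keep_empty)`.
    fn join(&self, sep: &str, keep_empty: bool) -> PyResult<String> {
        Ok(self
            .names
            .iter()
            .filter(|n| keep_empty || !n.is_empty())
            .cloned()
            .collect::<Vec<_>>()
            .join(sep))
    }
}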
tokenizers/Cargo.lock (generated, 2 changed lines)
@@ -351,7 +351,7 @@ dependencies = [

 [[package]]
 name = "tokenizers-lib"
-version = "0.0.7"
+version = "0.0.8"
 dependencies = [
  "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -2,7 +2,7 @@
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 edition = "2018"
 name = "tokenizers-lib"
-version = "0.0.7"
+version = "0.0.8"

 [lib]
 name = "tokenizers"
@@ -53,7 +53,7 @@ fn shell(matches: &ArgMatches) -> Result<()> {
         println!("Offsets:\t{:?}", encoded.get_offsets());
         println!(
             "Decoded:\t{}",
-            tokenizer.decode(encoded.get_ids().to_vec()).unwrap()
+            tokenizer.decode(encoded.get_ids().to_vec(), true).unwrap()
         );
         println!("Tokenized in {:?}", elapsed);
     }
@@ -93,6 +93,14 @@ pub struct AddedToken {
     /// Whether this token must be a single word or can break words
     pub single_word: bool,
 }
+impl AddedToken {
+    fn from(content: String) -> Self {
+        AddedToken {
+            content,
+            ..Default::default()
+        }
+    }
+}
 impl Default for AddedToken {
     fn default() -> Self {
         AddedToken {
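The new AddedToken::from builds a token from just its content, leaning on the Default impl shown right below it and on struct update syntax (..Default::default()). A standalone sketch of the same pattern, reusing the two fields visible in this hunk:

// Standalone sketch of the constructor-plus-Default pattern; Token is not the
// library's AddedToken, just a minimal stand-in with the same two fields.
#[derive(Debug, PartialEq)]
struct Token {
    content: String,
    single_word: bool,
}

impl Default for Token {
    fn default() -> Self {
        Token {
            content: String::new(),
            single_word: false,
        }
    }
}

impl Token {
    fn from(content: String) -> Self {
        // Struct update syntax: keep `content`, fill every other field
        // from the Default impl.
        Token {
            content,
            ..Default::default()
        }
    }
}

fn main() {
    let t = Token::from("[MASK]".to_owned());
    assert!(!t.single_word);
}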
@@ -132,6 +140,7 @@ pub struct Tokenizer {
     added_tokens: HashMap<AddedToken, u32>,
     added_tokens_r: HashMap<u32, AddedToken>,
     split_re: Option<regex::Regex>,
+    special_tokens: HashMap<String, u32>,

     // General processing parameters
     trunc: Option<TruncationParams>,
@@ -151,6 +160,7 @@ impl Tokenizer {
             added_tokens: HashMap::new(),
             added_tokens_r: HashMap::new(),
             split_re: None,
+            special_tokens: HashMap::new(),

             trunc: None,
             padding: None,
@@ -200,18 +210,31 @@ impl Tokenizer {
     }

     /// Get the size of the vocabulary
-    pub fn get_vocab_size(&self) -> usize {
+    pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
         self.model.get_vocab_size()
+            + if with_added_tokens {
+                self.added_tokens.len()
+            } else {
+                0
+            }
     }

     /// Converts a token in the corresponding id.
     pub fn token_to_id(&self, token: &str) -> Option<u32> {
-        self.model.token_to_id(token)
+        if let Some(id) = self.added_tokens.get(&AddedToken::from(token.to_owned())) {
+            Some(*id)
+        } else {
+            self.model.token_to_id(token)
+        }
     }

     /// Converts an id to the corresponding token.
     pub fn id_to_token(&self, id: u32) -> Option<String> {
-        self.model.id_to_token(id)
+        if let Some(token) = self.added_tokens_r.get(&id) {
+            Some(token.content.clone())
+        } else {
+            self.model.id_to_token(id)
+        }
     }

     /// Encode the given sentence
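All three accessors now follow the same pattern: consult the added-token maps first, fall back to the model, and only count added tokens into the vocabulary size when asked. A minimal self-contained sketch of that logic, with plain HashMaps standing in for the model and for the real field types:

use std::collections::HashMap;

// Standalone stand-in for the real Tokenizer fields; not the library's types.
struct Vocab {
    model: HashMap<String, u32>,        // stands in for the trained model's vocab
    added_tokens: HashMap<String, u32>, // token -> id for tokens added later
    added_tokens_r: HashMap<u32, String>, // reverse map, id -> token
}

impl Vocab {
    fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
        self.model.len() + if with_added_tokens { self.added_tokens.len() } else { 0 }
    }

    fn token_to_id(&self, token: &str) -> Option<u32> {
        // Added tokens shadow the model: check them first, then fall back.
        self.added_tokens
            .get(token)
            .copied()
            .or_else(|| self.model.get(token).copied())
    }

    fn id_to_token(&self, id: u32) -> Option<String> {
        if let Some(token) = self.added_tokens_r.get(&id) {
            Some(token.clone())
        } else {
            self.model
                .iter()
                .find(|(_, v)| **v == id)
                .map(|(k, _)| k.clone())
        }
    }
}

fn main() {
    let mut v = Vocab {
        model: HashMap::new(),
        added_tokens: HashMap::new(),
        added_tokens_r: HashMap::new(),
    };
    v.model.insert("hello".into(), 0);
    v.added_tokens.insert("[NEW]".into(), 1);
    v.added_tokens_r.insert(1, "[NEW]".into());

    assert_eq!(v.get_vocab_size(true), 2);
    assert_eq!(v.get_vocab_size(false), 1);
    assert_eq!(v.token_to_id("[NEW]"), Some(1));
    assert_eq!(v.id_to_token(1).as_deref(), Some("[NEW]"));
}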
@@ -324,15 +347,19 @@ impl Tokenizer {
     }

     /// Decode the given ids, back to a String
-    pub fn decode(&self, ids: Vec<u32>) -> Result<String> {
+    pub fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> Result<String> {
         let tokens = ids
             .into_iter()
             .map(|id| {
-                if let Some(token) = self.added_tokens_r.get(&id) {
+                let token = if let Some(token) = self.added_tokens_r.get(&id) {
                     Some(token.content.to_owned())
                 } else {
                     self.model.id_to_token(id)
-                }
+                };
+
+                token.filter(|token| {
+                    !skip_special_tokens || !self.special_tokens.contains_key(token)
+                })
             })
             .filter(|token| token.is_some())
             .map(|id| id.unwrap())
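The decode body still resolves each id to an Option<String>; the new part is the Option::filter call, which keeps the value only while the predicate holds, so registered special tokens are dropped whenever skip_special_tokens is true. A small self-contained sketch of just that predicate (keep_token is an invented helper, not a library function):

use std::collections::HashMap;

// Keep a resolved token only if we are not skipping special tokens,
// or if it is not registered as one.
fn keep_token(
    token: Option<String>,
    skip_special_tokens: bool,
    special_tokens: &HashMap<String, u32>,
) -> Option<String> {
    token.filter(|t| !skip_special_tokens || !special_tokens.contains_key(t))
}

fn main() {
    let mut special = HashMap::new();
    special.insert("[CLS]".to_owned(), 0);

    assert_eq!(keep_token(Some("hello".into()), true, &special), Some("hello".into()));
    assert_eq!(keep_token(Some("[CLS]".into()), true, &special), None);
    assert_eq!(keep_token(Some("[CLS]".into()), false, &special), Some("[CLS]".into()));
}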
@@ -346,10 +373,14 @@ impl Tokenizer {
     }

     /// Decode all sentences in parallel
-    pub fn decode_batch(&self, sentences: Vec<Vec<u32>>) -> Result<Vec<String>> {
+    pub fn decode_batch(
+        &self,
+        sentences: Vec<Vec<u32>>,
+        skip_special_tokens: bool,
+    ) -> Result<Vec<String>> {
         sentences
             .into_par_iter()
-            .map(|sentence| self.decode(sentence))
+            .map(|sentence| self.decode(sentence, skip_special_tokens))
             .collect()
     }

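decode_batch only threads the new flag through the existing parallel pipeline; collecting rayon's parallel iterator of Results into a single Result<Vec<_>> still surfaces an error if any sentence fails. Below is a standalone sketch of that pattern; decode_one and its String error type are invented stand-ins for the real decode, not the library's API.

use rayon::prelude::*;

// Stand-in for Tokenizer::decode: fails on ids it does not know about,
// and pretends id 0 is a special token that can be skipped.
fn decode_one(ids: Vec<u32>, skip_special_tokens: bool) -> Result<String, String> {
    let tokens: Result<Vec<String>, String> = ids
        .into_iter()
        .filter(|&id| !(skip_special_tokens && id == 0))
        .map(|id| {
            if id < 100 {
                Ok(format!("tok{}", id))
            } else {
                Err(format!("unknown id {}", id))
            }
        })
        .collect();
    Ok(tokens?.join(" "))
}

fn decode_batch(sentences: Vec<Vec<u32>>, skip_special_tokens: bool) -> Result<Vec<String>, String> {
    // collect() on a parallel iterator of Results yields Err if any element
    // fails, mirroring the sequential behaviour.
    sentences
        .into_par_iter()
        .map(|sentence| decode_one(sentence, skip_special_tokens))
        .collect()
}

fn main() {
    let decoded = decode_batch(vec![vec![0, 1, 2], vec![3, 4]], true).unwrap();
    assert_eq!(decoded, vec!["tok1 tok2".to_owned(), "tok3 tok4".to_owned()]);
}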
@@ -467,11 +498,29 @@ impl Tokenizer {
         Ok(final_encoding)
     }

+    /// Register the given tokens as special tokens. This is especially useful for removing
+    /// these special tokens while decoding
+    pub fn add_special_tokens(&mut self, tokens: &[&str]) -> usize {
+        let added_tokens = tokens
+            .iter()
+            .map(|t| AddedToken::from((*t).to_owned()))
+            .collect::<Vec<_>>();
+
+        let added = self.add_tokens(&added_tokens);
+        for token in tokens {
+            if let Some(id) = self.token_to_id(token) {
+                self.special_tokens.entry((*token).to_owned()).or_insert(id);
+            }
+        }
+
+        added
+    }
+
     /// Add the given tokens to the added vocabulary
     pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize {
         let mut ignored = 0;
         for token in tokens {
-            if token.content.is_empty() {
+            if token.content.is_empty() || self.token_to_id(&token.content).is_some() {
                 ignored += 1;
                 continue;
             }
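Taken together: add_special_tokens delegates to add_tokens and then records each token's id in special_tokens so decode can skip it, while add_tokens now ignores empty tokens and tokens that already resolve to an id, so re-adding an existing token no longer consumes a new id. A compact standalone sketch of that bookkeeping (Registry and next_id are invented for the example):

use std::collections::HashMap;

#[derive(Default)]
struct Registry {
    added_tokens: HashMap<String, u32>,
    added_tokens_r: HashMap<u32, String>,
    special_tokens: HashMap<String, u32>,
    next_id: u32,
}

impl Registry {
    fn token_to_id(&self, token: &str) -> Option<u32> {
        self.added_tokens.get(token).copied()
    }

    // Mirrors the new dedup check: skip empty tokens and tokens that already map to an id.
    fn add_tokens(&mut self, tokens: &[String]) -> usize {
        let mut added = 0;
        for token in tokens {
            if token.is_empty() || self.token_to_id(token).is_some() {
                continue;
            }
            let id = self.next_id;
            self.next_id += 1;
            self.added_tokens.insert(token.clone(), id);
            self.added_tokens_r.insert(id, token.clone());
            added += 1;
        }
        added
    }

    // Add the tokens, then remember their ids so decoding can skip them later.
    fn add_special_tokens(&mut self, tokens: &[&str]) -> usize {
        let owned: Vec<String> = tokens.iter().map(|t| (*t).to_owned()).collect();
        let added = self.add_tokens(&owned);
        for token in tokens {
            if let Some(id) = self.token_to_id(token) {
                self.special_tokens.entry((*token).to_owned()).or_insert(id);
            }
        }
        added
    }
}

fn main() {
    let mut reg = Registry::default();
    assert_eq!(reg.add_special_tokens(&["[CLS]", "[SEP]"]), 2);
    assert_eq!(reg.add_special_tokens(&["[CLS]"]), 0); // already known: ignored
    assert!(reg.special_tokens.contains_key("[SEP]"));
}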