Fix split on special tokens & bump version

bindings/python/Cargo.lock (generated): 6 lines changed
@@ -555,7 +555,7 @@ dependencies = [
 
 [[package]]
 name = "tokenizers"
-version = "0.6.0"
+version = "0.6.1"
 dependencies = [
  "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "indicatif 0.13.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -571,10 +571,10 @@ dependencies = [
 
 [[package]]
 name = "tokenizers-python"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
  "pyo3 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
- "tokenizers 0.6.0",
+ "tokenizers 0.6.1",
 ]
 
 [[package]]

bindings/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.1.0"
+version = "0.1.1"
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 edition = "2018"
 

bindings/python/setup.py
@@ -3,7 +3,7 @@ from setuptools_rust import Binding, RustExtension
 
 setup(
     name="tokenizers",
-    version="0.1.0",
+    version="0.1.1",
     description="Fast and Customizable Tokenizers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",

bindings/python/tokenizers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.1.0"
+__version__ = "0.1.1"
 
 from .tokenizers import Tokenizer, Encoding
 from .tokenizers import decoders

tokenizers/Cargo.toml
@@ -2,7 +2,7 @@
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 edition = "2018"
 name = "tokenizers"
-version = "0.6.0"
+version = "0.6.1"
 homepage = "https://github.com/huggingface/tokenizers"
 repository = "https://github.com/huggingface/tokenizers"
 documentation = "https://docs.rs/tokenizers/"

tokenizers/src/tokenizer/mod.rs
@@ -565,6 +565,8 @@ impl Tokenizer {
             }
         }
 
+        self.refresh_added_tokens();
+
         added
     }
 
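
Both mutating methods in impl Tokenizer (the one above that returns added, and the one in the next hunk that returns tokens.len() - ignored) now end by calling the new refresh_added_tokens helper, so the split regex is rebuilt after every change to either token set. Below is a minimal standalone sketch of that refresh-after-mutation shape; the names and the plain joined pattern are invented for illustration and are not the crate's actual fields or logic.

use std::collections::BTreeSet;

// Illustrative only: invented names, not the tokenizers crate's internals.
// Every mutating method funnels into one private refresh step so the derived
// split pattern never goes stale, mirroring the refresh_added_tokens() calls
// added in this commit.
#[derive(Default)]
struct Vocab {
    tokens: BTreeSet<String>,
    special: BTreeSet<String>,
    split_pattern: Option<String>, // derived from the two sets above
}

impl Vocab {
    fn add_tokens(&mut self, new: &[&str]) -> usize {
        let mut added = 0;
        for t in new {
            if self.tokens.insert((*t).to_string()) {
                added += 1;
            }
        }
        self.refresh(); // keep derived state in sync with the mutation
        added
    }

    fn add_special_tokens(&mut self, new: &[&str]) -> usize {
        let mut added = 0;
        for t in new {
            if self.special.insert((*t).to_string()) {
                added += 1;
            }
        }
        self.refresh(); // same refresh path as add_tokens
        added
    }

    fn refresh(&mut self) {
        // Rebuilt from scratch on every call, because either set may have changed.
        let all: Vec<String> = self.tokens.union(&self.special).cloned().collect();
        self.split_pattern = if all.is_empty() { None } else { Some(all.join("|")) };
    }
}

fn main() {
    let mut v = Vocab::default();
    v.add_tokens(&["my_token"]);
    v.add_special_tokens(&["[CLS]", "[SEP]"]);
    println!("{:?}", v.split_pattern); // Some("[CLS]|[SEP]|my_token")
}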
@@ -591,11 +593,27 @@ impl Tokenizer {
                 .or_insert_with(|| token.clone());
         }
 
+        self.refresh_added_tokens();
+
+        // Return the number of added tokens
+        tokens.len() - ignored
+    }
+
+    fn refresh_added_tokens(&mut self) {
         // We rebuild the regex here everytime on purpose, because the added tokens may
         // have changed
+        let special_tokens = self
+            .special_tokens
+            .keys()
+            .map(|t| AddedToken {
+                content: t.to_owned(),
+                single_word: true,
+            })
+            .collect::<Vec<_>>();
         let added_tokens = self
             .added_tokens
             .keys()
+            .chain(special_tokens.iter())
             .map(|token| {
                 if token.single_word {
                     let first_b = token
@@ -635,9 +653,6 @@ impl Tokenizer {
             self.split_re =
                 Some(regex::Regex::new(&format!(r"({})", added_tokens.join("|"))).unwrap());
         }
-
-        // Return the number of added tokens
-        tokens.len() - ignored
     }
 
     /// Split the given sentence on multiple parts, finding the added tokens and their id in the process
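
As the comment in refresh_added_tokens says, the whole alternation is rebuilt from scratch on every call, and special tokens are now chained into it alongside regular added tokens before the regex is compiled. Here is a self-contained sketch of that rebuild step; it assumes the regex crate as a dependency, the function name and set types are invented, and it uses regex::escape for literal safety where the real helper instead implements the single_word boundary handling shown above.

use regex::Regex; // sketch assumes the `regex` crate in Cargo.toml
use std::collections::BTreeSet;

// Rebuild one alternation regex covering both added and special tokens.
// The pattern is rebuilt in full every time, since either set may have changed.
fn rebuild_split_re(added: &BTreeSet<String>, special: &BTreeSet<String>) -> Option<Regex> {
    let parts: Vec<String> = added
        .iter()
        .chain(special.iter())        // special tokens join the same alternation
        .map(|t| regex::escape(t))    // escape so "[CLS]" is matched literally
        .collect();
    if parts.is_empty() {
        None
    } else {
        // One capturing group over the alternation, like r"({})" in the diff above.
        Some(Regex::new(&format!(r"({})", parts.join("|"))).expect("valid pattern"))
    }
}

fn main() {
    let added: BTreeSet<String> = ["my_token"].iter().map(|s| s.to_string()).collect();
    let special: BTreeSet<String> = ["[CLS]", "[SEP]"].iter().map(|s| s.to_string()).collect();
    let re = rebuild_split_re(&added, &special).expect("non-empty token sets");
    for m in re.find_iter("[CLS] hello my_token world [SEP]") {
        println!("matched {} at {}..{}", m.as_str(), m.start(), m.end());
    }
}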
@@ -677,10 +692,13 @@ impl Tokenizer {
                 .into_iter()
                 .map(|(start, end)| unsafe {
                     let s = sentence.get_unchecked(start..end).to_owned();
-                    let id = self.added_tokens.get(&AddedToken {
-                        content: s.clone(),
-                        ..Default::default()
-                    });
+                    let mut id = self.special_tokens.get(&s);
+                    if id.is_none() {
+                        id = self.added_tokens.get(&AddedToken {
+                            content: s.clone(),
+                            ..Default::default()
+                        });
+                    }
                     (s, id.copied())
                 })
                 .collect()
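
This last hunk is the core of the fix: a matched piece is now resolved against special_tokens first (keyed by the plain string) and only falls back to added_tokens, so special tokens get their ids when the sentence is split. A small standalone sketch of that two-step lookup, with hypothetical maps standing in for the tokenizer's fields (the real added_tokens map is keyed by AddedToken rather than by String):

use std::collections::HashMap;

// Resolve the id of a matched piece: special tokens first, then added tokens.
// The maps are illustrative stand-ins for the tokenizer's internal fields.
fn resolve_id(
    piece: &str,
    special_tokens: &HashMap<String, u32>,
    added_tokens: &HashMap<String, u32>,
) -> Option<u32> {
    let mut id = special_tokens.get(piece);
    if id.is_none() {
        id = added_tokens.get(piece); // fall back to regular added tokens
    }
    id.copied()
}

fn main() {
    let mut special = HashMap::new();
    special.insert("[CLS]".to_string(), 101u32);
    let mut added = HashMap::new();
    added.insert("my_token".to_string(), 30000u32);

    assert_eq!(resolve_id("[CLS]", &special, &added), Some(101));
    assert_eq!(resolve_id("my_token", &special, &added), Some(30000));
    assert_eq!(resolve_id("plain", &special, &added), None);
}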