Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-07 21:28:19 +00:00)

Python - Update bindings
@@ -121,7 +121,7 @@ you need together:
 #### Use a pre-trained tokenizer
 
 ```python
-from tokenizers import Tokenizer, models, pre_tokenizers, decoders, normalizers, processors
+from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
 
 # Load a BPE Model
 vocab = "./path/to/vocab.json"
@@ -132,8 +132,7 @@ bpe = models.BPE.from_files(vocab, merges)
 tokenizer = Tokenizer(bpe)
 
 # Customize pre-tokenization and decoding
-tokenizer.normalizer = normalizers.ByteLevel(add_prefix_space=True)
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
 tokenizer.decoder = decoders.ByteLevel()
 tokenizer.post_processor = processors.ByteLevel()
@@ -159,8 +158,7 @@ from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, pr
 tokenizer = Tokenizer(models.BPE.empty())
 
 # Customize pre-tokenization and decoding
-tokenizer.normalizer = normalizers.ByteLevel(add_prefix_space=True)
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
 tokenizer.decoder = decoders.ByteLevel()
 tokenizer.post_processor = processors.ByteLevel()
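The net change in both README snippets is the same: the ByteLevel normalizer goes away and `add_prefix_space` moves onto the ByteLevel pre-tokenizer. A minimal consolidated sketch of the updated setup, assuming local vocab/merges files; the `encode`/`decode` round-trip at the end is not part of this diff and only illustrates that usage is otherwise unchanged:

```python
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

# Load a BPE model (paths are placeholders)
bpe = models.BPE.from_files("./path/to/vocab.json", "./path/to/merges.txt")
tokenizer = Tokenizer(bpe)

# add_prefix_space is now configured on the pre-tokenizer, not a normalizer
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel()

# Not part of the diff: a simple round-trip to show usage is unchanged
output = tokenizer.encode("Hello, y'all! How are you?")
print(output.tokens)
print(tokenizer.decode(output.ids))
```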
@@ -60,10 +60,8 @@ if args.type == "gpt2":
 
     # Create a Tokenizer using BPE
     tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
-    # Use ByteLevel Normalizer
-    tok_r.normalizer = normalizers.ByteLevel(add_prefix_space=False)
     # Use ByteLevel PreTokenizer
-    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel()
+    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
     # Use ByteLevel Decoder
     tok_r.decoder = decoders.ByteLevel()
 elif args.type == "bert":
@@ -78,7 +78,6 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<normalizers::Sequence>()?;
     m.add_class::<normalizers::Lowercase>()?;
     m.add_class::<normalizers::Strip>()?;
-    m.add_class::<normalizers::ByteLevel>()?;
     Ok(())
 }
@@ -158,30 +158,3 @@ impl Strip {
         }))
     }
 }
-
-#[pyclass(extends=Normalizer)]
-pub struct ByteLevel {}
-#[pymethods]
-impl ByteLevel {
-    #[new]
-    #[args(kwargs = "**")]
-    fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
-        let mut add_prefix_space = true;
-
-        if let Some(kwargs) = kwargs {
-            for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
-                    "add_prefix_space" => add_prefix_space = value.extract()?,
-                    _ => println!("Ignored unknown kwargs option {}", key),
-                }
-            }
-        }
-
-        Ok(obj.init(Normalizer {
-            normalizer: Container::Owned(Box::new(tk::normalizers::byte_level::ByteLevel::new(
-                add_prefix_space,
-            ))),
-        }))
-    }
-}
@@ -22,7 +22,13 @@ impl PreTokenizer {
     }
 
     fn pre_tokenize(&self, s: &str) -> PyResult<Vec<(String, Offsets)>> {
-        ToPyResult(self.pretok.execute(|pretok| pretok.pre_tokenize(s))).into()
+        // TODO: Expose the NormalizedString
+        let mut normalized = tk::tokenizer::NormalizedString::from(s);
+        ToPyResult(
+            self.pretok
+                .execute(|pretok| pretok.pre_tokenize(&mut normalized)),
+        )
+        .into()
     }
 }
@@ -31,10 +37,23 @@ pub struct ByteLevel {}
 #[pymethods]
 impl ByteLevel {
     #[new]
-    fn new(obj: &PyRawObject) -> PyResult<()> {
+    #[args(kwargs = "**")]
+    fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
+        let mut add_prefix_space = true;
+
+        if let Some(kwargs) = kwargs {
+            for (key, value) in kwargs {
+                let key: &str = key.extract()?;
+                match key {
+                    "add_prefix_space" => add_prefix_space = value.extract()?,
+                    _ => println!("Ignored unknown kwargs option {}", key),
+                }
+            }
+        }
+
         Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::byte_level::ByteLevel::new(
-                false,
+                add_prefix_space,
             ))),
         }))
     }
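With the kwargs parsing above, the Python-side ByteLevel pre-tokenizer accepts `add_prefix_space` (defaulting to true), and any other keyword is only reported via a printed warning rather than raising. A small sketch of the resulting Python behavior; the misspelled kwarg in the last line is deliberate and hypothetical:

```python
from tokenizers import pre_tokenizers

# Explicitly disable the prefix space (as the GPT-2 example script now does)
pre_tok = pre_tokenizers.ByteLevel(add_prefix_space=False)

# Omitting the kwarg keeps the default add_prefix_space=True
pre_tok_default = pre_tokenizers.ByteLevel()

# An unrecognized kwarg is not an error: the binding prints
# "Ignored unknown kwargs option ..." and falls back to the defaults
pre_tok_typo = pre_tokenizers.ByteLevel(add_prefix=True)
```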
@@ -151,11 +170,14 @@ impl PyPreTokenizer {
 }
 
 impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
-    fn pre_tokenize(&self, sentence: &str) -> Result<Vec<(String, Offsets)>> {
+    fn pre_tokenize(
+        &self,
+        sentence: &mut tk::tokenizer::NormalizedString,
+    ) -> Result<Vec<(String, Offsets)>> {
         let gil = Python::acquire_gil();
         let py = gil.python();
 
-        let args = PyTuple::new(py, &[sentence]);
+        let args = PyTuple::new(py, &[sentence.get()]);
         match self.class.call_method(py, "pre_tokenize", args, None) {
             Ok(res) => Ok(res
                 .cast_as::<PyList>(py)
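Because the binding now builds a `NormalizedString` internally and passes `sentence.get()` into Python, a custom pre-tokenizer written in Python still receives a plain `str` and returns a list of `(piece, (start, end))` tuples, as implied by the `PyList` handling above. The class below is purely illustrative; how such an object is attached to a tokenizer is not shown in this diff:

```python
from typing import List, Tuple

class WhitespacePreTokenizer:
    """Illustrative Python pre-tokenizer matching the calling convention of
    PyPreTokenizer: input is the normalized text as a str, output is a list
    of (piece, (start, end)) tuples."""

    def pre_tokenize(self, sentence: str) -> List[Tuple[str, Tuple[int, int]]]:
        pieces = []
        start = 0
        for word in sentence.split(" "):
            end = start + len(word)
            if word:
                pieces.append((word, (start, end)))
            start = end + 1  # account for the single-space separator
        return pieces
```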
@@ -1,4 +1,3 @@
-import tokenizers
 from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers, processors
 from tokenizers.models import BPE
 from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
@@ -37,9 +36,9 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         else:
             tokenizer = Tokenizer(BPE.empty())
 
-        normalizers = [tokenizers.normalizers.ByteLevel(add_prefix_space=add_prefix_space)]
-
         # Check for Unicode normalization first (before everything else)
+        normalizers = []
+
         if unicode_normalizer:
             normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
@@ -53,7 +52,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         else:
             tokenizer.normalizer = normalizers[0]
 
-        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
+        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
         tokenizer.decoder = decoders.ByteLevel()
         tokenizer.post_processor = processors.ByteLevel()
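For the high-level `ByteLevelBPETokenizer`, the practical effect is that `add_prefix_space` is now forwarded to the ByteLevel pre-tokenizer instead of a normalizer. A hedged usage sketch; the constructor arguments below (vocab/merges paths and `add_prefix_space`) are inferred from the variables this file references, not from a signature shown in the diff:

```python
from tokenizers import ByteLevelBPETokenizer

# Paths are placeholders; argument names are inferred, not confirmed by the diff
tokenizer = ByteLevelBPETokenizer(
    "./path/to/vocab.json",
    "./path/to/merges.txt",
    add_prefix_space=True,
)

output = tokenizer.encode("Hello, y'all!")
print(output.tokens)
```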
@@ -9,7 +9,6 @@ NFKC = normalizers.NFKC
 Sequence = normalizers.Sequence
 Lowercase = normalizers.Lowercase
 Strip = normalizers.Strip
-ByteLevel = normalizers.ByteLevel
 
 NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
@@ -98,22 +98,6 @@ class Strip(Normalizer):
     def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
         pass
 
-class ByteLevel(Normalizer):
-    """ ByteLevel normalizer """
-
-    def __init__(self, add_prefix_space: bool = True) -> Normalizer:
-        """ Instantiate a new ByteLevel Normalizer
-
-        Args:
-            add_prefix_space: (`optional`) boolean:
-                Whether to add a space to the first word if there isn't already one. This
-                lets us treat `hello` exactly like `say hello`.
-
-        Returns:
-            Normalizer
-        """
-        pass
-
 def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
     """
     Instanciate unicode normalizer from the normalizer name
@@ -20,8 +20,15 @@ class ByteLevel(PreTokenizer):
     with a corresponding representation, as well as splitting into words.
     """
 
-    def __init__(self) -> None:
-        """ Instantiate a new ByteLevel PreTokenizer """
+    def __init__(self, add_prefix_space: bool = True) -> None:
+        """ Instantiate a new ByteLevel PreTokenizer
+        Args:
+            add_prefix_space: (`optional`) boolean:
+                Whether to add a space to the first word if there isn't already one. This
+                lets us treat `hello` exactly like `say hello`.
+        Returns:
+            PreTokenizer
+        """
         pass
     @staticmethod
     def alphabet() -> List[str]:
@@ -2,6 +2,3 @@ pub mod bert;
 pub mod strip;
 pub mod unicode;
 pub mod utils;
-
-// Re-export these as normalizers
-pub use super::pre_tokenizers::byte_level;