Python - Update bindings
@@ -121,7 +121,7 @@ you need together:
 #### Use a pre-trained tokenizer
 
 ```python
-from tokenizers import Tokenizer, models, pre_tokenizers, decoders, normalizers, processors
+from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
 
 # Load a BPE Model
 vocab = "./path/to/vocab.json"
@@ -132,8 +132,7 @@ bpe = models.BPE.from_files(vocab, merges)
 tokenizer = Tokenizer(bpe)
 
 # Customize pre-tokenization and decoding
-tokenizer.normalizer = normalizers.ByteLevel(add_prefix_space=True)
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
 tokenizer.decoder = decoders.ByteLevel()
 tokenizer.post_processor = processors.ByteLevel()
 
@@ -159,8 +158,7 @@ from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, pr
 tokenizer = Tokenizer(models.BPE.empty())
 
 # Customize pre-tokenization and decoding
-tokenizer.normalizer = normalizers.ByteLevel(add_prefix_space=True)
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
 tokenizer.decoder = decoders.ByteLevel()
 tokenizer.post_processor = processors.ByteLevel()
 
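Taken together, the README hunks above drop the ByteLevel normalizer and move `add_prefix_space` onto the ByteLevel pre-tokenizer. A minimal sketch of the resulting pre-trained setup; the paths and the sample sentence are placeholders, not part of this diff, and the encode/decode calls follow the Python API shown elsewhere in the README:

```python
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

# Placeholder paths: point these at a GPT-2 style vocab.json / merges.txt pair.
vocab = "./path/to/vocab.json"
merges = "./path/to/merges.txt"
tokenizer = Tokenizer(models.BPE.from_files(vocab, merges))

# add_prefix_space now lives on the pre-tokenizer; there is no ByteLevel normalizer anymore.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel()

encoded = tokenizer.encode("Hello, y'all! How are you?")
print(encoded.tokens)
print(tokenizer.decode(encoded.ids))
```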
@@ -60,10 +60,8 @@ if args.type == "gpt2":
 
     # Create a Tokenizer using BPE
    tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
-    # Use ByteLevel Normalizer
-    tok_r.normalizer = normalizers.ByteLevel(add_prefix_space=False)
     # Use ByteLevel PreTokenizer
-    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel()
+    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
     # Use ByteLevel Decoder
     tok_r.decoder = decoders.ByteLevel()
 elif args.type == "bert":
@@ -78,7 +78,6 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<normalizers::Sequence>()?;
     m.add_class::<normalizers::Lowercase>()?;
     m.add_class::<normalizers::Strip>()?;
-    m.add_class::<normalizers::ByteLevel>()?;
     Ok(())
 }
 
@@ -158,30 +158,3 @@ impl Strip {
         }))
     }
 }
-
-#[pyclass(extends=Normalizer)]
-pub struct ByteLevel {}
-#[pymethods]
-impl ByteLevel {
-    #[new]
-    #[args(kwargs = "**")]
-    fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
-        let mut add_prefix_space = true;
-
-        if let Some(kwargs) = kwargs {
-            for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
-                    "add_prefix_space" => add_prefix_space = value.extract()?,
-                    _ => println!("Ignored unknown kwargs option {}", key),
-                }
-            }
-        }
-
-        Ok(obj.init(Normalizer {
-            normalizer: Container::Owned(Box::new(tk::normalizers::byte_level::ByteLevel::new(
-                add_prefix_space,
-            ))),
-        }))
-    }
-}
@@ -22,7 +22,13 @@ impl PreTokenizer {
     }
 
     fn pre_tokenize(&self, s: &str) -> PyResult<Vec<(String, Offsets)>> {
-        ToPyResult(self.pretok.execute(|pretok| pretok.pre_tokenize(s))).into()
+        // TODO: Expose the NormalizedString
+        let mut normalized = tk::tokenizer::NormalizedString::from(s);
+        ToPyResult(
+            self.pretok
+                .execute(|pretok| pretok.pre_tokenize(&mut normalized)),
+        )
+        .into()
     }
 }
 
@@ -31,10 +37,23 @@ pub struct ByteLevel {}
 #[pymethods]
 impl ByteLevel {
     #[new]
-    fn new(obj: &PyRawObject) -> PyResult<()> {
+    #[args(kwargs = "**")]
+    fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
+        let mut add_prefix_space = true;
+
+        if let Some(kwargs) = kwargs {
+            for (key, value) in kwargs {
+                let key: &str = key.extract()?;
+                match key {
+                    "add_prefix_space" => add_prefix_space = value.extract()?,
+                    _ => println!("Ignored unknown kwargs option {}", key),
+                }
+            }
+        }
+
         Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::byte_level::ByteLevel::new(
-                false,
+                add_prefix_space,
             ))),
         }))
     }
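On the Rust side, the ByteLevel pre-tokenizer constructor now reads `add_prefix_space` out of `**kwargs`, defaults it to true, and only prints a message for options it does not recognize. A small sketch of what that means from Python; the misspelled keyword is a deliberate, hypothetical example:

```python
from tokenizers import pre_tokenizers

# Recognized keyword: flips the flag passed to the Rust ByteLevel pre-tokenizer.
pt = pre_tokenizers.ByteLevel(add_prefix_space=False)

# Unrecognized keyword: the binding prints "Ignored unknown kwargs option ..."
# and keeps the default (add_prefix_space=True) instead of raising an error.
pt_typo = pre_tokenizers.ByteLevel(add_prefix_spaces=False)
```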
@@ -151,11 +170,14 @@ impl PyPreTokenizer {
 }
 
 impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
-    fn pre_tokenize(&self, sentence: &str) -> Result<Vec<(String, Offsets)>> {
+    fn pre_tokenize(
+        &self,
+        sentence: &mut tk::tokenizer::NormalizedString,
+    ) -> Result<Vec<(String, Offsets)>> {
         let gil = Python::acquire_gil();
         let py = gil.python();
 
-        let args = PyTuple::new(py, &[sentence]);
+        let args = PyTuple::new(py, &[sentence.get()]);
         match self.class.call_method(py, "pre_tokenize", args, None) {
             Ok(res) => Ok(res
                 .cast_as::<PyList>(py)
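The wrapper above is what lets a Python object act as a pre-tokenizer: the Rust side now hands it the content of the NormalizedString (`sentence.get()`) and casts the return value to a list of `(piece, (start, end))` tuples. A rough sketch of a Python class satisfying that contract, using plain whitespace splitting purely for illustration; how such an object gets registered is outside this diff:

```python
class WhitespacePreTokenizer:
    """Illustrative only: splits on single spaces and reports offsets
    into the string received from the Rust wrapper."""

    def pre_tokenize(self, sentence: str):
        pieces = []
        cursor = 0
        for word in sentence.split(" "):
            if not word:
                continue
            start = sentence.index(word, cursor)
            end = start + len(word)
            pieces.append((word, (start, end)))
            cursor = end
        return pieces
```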
@@ -1,4 +1,3 @@
-import tokenizers
 from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers, processors
 from tokenizers.models import BPE
 from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
@@ -37,9 +36,9 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         else:
             tokenizer = Tokenizer(BPE.empty())
 
-        normalizers = [tokenizers.normalizers.ByteLevel(add_prefix_space=add_prefix_space)]
-
         # Check for Unicode normalization first (before everything else)
+        normalizers = []
+
         if unicode_normalizer:
             normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
 
@@ -53,7 +52,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         else:
             tokenizer.normalizer = normalizers[0]
 
-        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
+        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
         tokenizer.decoder = decoders.ByteLevel()
         tokenizer.post_processor = processors.ByteLevel()
 
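For the high-level ByteLevelBPETokenizer, the net effect of the two hunks above is that `add_prefix_space` is forwarded straight to `pre_tokenizers.ByteLevel` instead of going through a normalizer. A usage sketch; the paths are placeholders and the keyword names for the vocabulary files are assumed from the surrounding constructor, which this diff does not show:

```python
from tokenizers.implementations import ByteLevelBPETokenizer

tok = ByteLevelBPETokenizer(
    vocab_file="./path/to/vocab.json",    # assumed parameter name, placeholder path
    merges_file="./path/to/merges.txt",   # assumed parameter name, placeholder path
    add_prefix_space=True,
)
print(tok.encode("Hello world").tokens)
```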
@@ -9,7 +9,6 @@ NFKC = normalizers.NFKC
 Sequence = normalizers.Sequence
 Lowercase = normalizers.Lowercase
 Strip = normalizers.Strip
-ByteLevel = normalizers.ByteLevel
 
 
 NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
@@ -98,22 +98,6 @@ class Strip(Normalizer):
     def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
         pass
 
-class ByteLevel(Normalizer):
-    """ ByteLevel normalizer """
-
-    def __init__(self, add_prefix_space: bool = True) -> Normalizer:
-        """ Instantiate a new ByteLevel Normalizer
-
-        Args:
-            add_prefix_space: (`optional`) boolean:
-                Whether to add a space to the first word if there isn't already one. This
-                lets us treat `hello` exactly like `say hello`.
-
-        Returns:
-            Normalizer
-        """
-        pass
-
 def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
     """
     Instanciate unicode normalizer from the normalizer name
@@ -20,8 +20,15 @@ class ByteLevel(PreTokenizer):
     with a corresponding representation, as well as splitting into words.
     """
 
-    def __init__(self) -> None:
-        """ Instantiate a new ByteLevel PreTokenizer """
+    def __init__(self, add_prefix_space: bool = True) -> None:
+        """ Instantiate a new ByteLevel PreTokenizer
+        Args:
+            add_prefix_space: (`optional`) boolean:
+                Whether to add a space to the first word if there isn't already one. This
+                lets us treat `hello` exactly like `say hello`.
+        Returns:
+            PreTokenizer
+        """
         pass
     @staticmethod
     def alphabet() -> List[str]:
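The docstring above is the user-facing explanation of `add_prefix_space`. Since the PreTokenizer binding also exposes `pre_tokenize` (see the earlier hunk), the difference can be inspected directly. A small sketch; the sample string is an arbitrary choice and the exact tokens printed depend on the byte-level mapping:

```python
from tokenizers import pre_tokenizers

with_prefix = pre_tokenizers.ByteLevel(add_prefix_space=True)
without_prefix = pre_tokenizers.ByteLevel(add_prefix_space=False)

# With the prefix space, the first word is treated like any word that follows
# a space (it picks up the byte-level space marker); without it, it is not.
print(with_prefix.pre_tokenize("hello world"))
print(without_prefix.pre_tokenize("hello world"))
```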
@@ -2,6 +2,3 @@ pub mod bert;
 pub mod strip;
 pub mod unicode;
 pub mod utils;
-
-// Re-export these as normalizers
-pub use super::pre_tokenizers::byte_level;