Python - Update bindings

Anthony MOI
2020-03-09 18:37:03 -04:00
parent 6a50ecfa5c
commit 7e9003ccb7
10 changed files with 43 additions and 67 deletions

View File

@@ -121,7 +121,7 @@ you need together:
#### Use a pre-trained tokenizer
```python
-from tokenizers import Tokenizer, models, pre_tokenizers, decoders, normalizers, processors
+from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
# Load a BPE Model
vocab = "./path/to/vocab.json"
@@ -132,8 +132,7 @@ bpe = models.BPE.from_files(vocab, merges)
tokenizer = Tokenizer(bpe)
# Customize pre-tokenization and decoding
-tokenizer.normalizer = normalizers.ByteLevel(add_prefix_space=True)
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel()
@@ -159,8 +158,7 @@ from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
tokenizer = Tokenizer(models.BPE.empty())
# Customize pre-tokenization and decoding
-tokenizer.normalizer = normalizers.ByteLevel(add_prefix_space=True)
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel()
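
Taken together, the updated README snippets can be exercised end to end. A minimal sketch (not part of the diff), assuming a `vocab.json`/`merges.txt` pair exists at the given paths and using only the calls shown above plus `encode`/`decode`:

```python
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

bpe = models.BPE.from_files("./path/to/vocab.json", "./path/to/merges.txt")
tokenizer = Tokenizer(bpe)

# add_prefix_space is now an option of the pre-tokenizer; no separate normalizer
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel()

encoding = tokenizer.encode("Hello world")
print(encoding.tokens)                 # byte-level pieces
print(tokenizer.decode(encoding.ids))  # back to readable text
```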

View File

@@ -60,10 +60,8 @@ if args.type == "gpt2":
# Create a Tokenizer using BPE
tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
-# Use ByteLevel Normalizer
-tok_r.normalizer = normalizers.ByteLevel(add_prefix_space=False)
# Use ByteLevel PreTokenizer
-tok_r.pre_tokenizer = pre_tokenizers.ByteLevel()
+tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
# Use ByteLevel Decoder
tok_r.decoder = decoders.ByteLevel()
elif args.type == "bert":
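
For GPT-2, `add_prefix_space=False` keeps the pre-tokenizer from inserting a space before the first word. An illustrative comparison (not in the diff; outputs are indicative, and `Ġ` is the byte-level encoding of a space):

```python
from tokenizers import pre_tokenizers

print(pre_tokenizers.ByteLevel(add_prefix_space=True).pre_tokenize("hello"))
# e.g. [('Ġhello', (0, 5))] -- a space was prepended to the first word
print(pre_tokenizers.ByteLevel(add_prefix_space=False).pre_tokenize("hello"))
# e.g. [('hello', (0, 5))] -- input left untouched
```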

View File

@@ -78,7 +78,6 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<normalizers::Sequence>()?;
m.add_class::<normalizers::Lowercase>()?;
m.add_class::<normalizers::Strip>()?;
-m.add_class::<normalizers::ByteLevel>()?;
Ok(())
}

View File

@@ -158,30 +158,3 @@ impl Strip {
}))
}
}
-#[pyclass(extends=Normalizer)]
-pub struct ByteLevel {}
-#[pymethods]
-impl ByteLevel {
-#[new]
-#[args(kwargs = "**")]
-fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
-let mut add_prefix_space = true;
-if let Some(kwargs) = kwargs {
-for (key, value) in kwargs {
-let key: &str = key.extract()?;
-match key {
-"add_prefix_space" => add_prefix_space = value.extract()?,
-_ => println!("Ignored unknown kwargs option {}", key),
-}
-}
-}
-Ok(obj.init(Normalizer {
-normalizer: Container::Owned(Box::new(tk::normalizers::byte_level::ByteLevel::new(
-add_prefix_space,
-))),
-}))
-}
-}

View File

@@ -22,7 +22,13 @@ impl PreTokenizer {
}
fn pre_tokenize(&self, s: &str) -> PyResult<Vec<(String, Offsets)>> {
-ToPyResult(self.pretok.execute(|pretok| pretok.pre_tokenize(s))).into()
+// TODO: Expose the NormalizedString
+let mut normalized = tk::tokenizer::NormalizedString::from(s);
+ToPyResult(
+self.pretok
+.execute(|pretok| pretok.pre_tokenize(&mut normalized)),
+)
+.into()
}
}
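
From Python, this hunk leaves the call signature untouched: `pre_tokenize` still takes a `str`, builds the `NormalizedString` internally (not yet exposed, per the TODO), and returns `(piece, (start, end))` pairs. For example:

```python
from tokenizers import pre_tokenizers

pretok = pre_tokenizers.ByteLevel(add_prefix_space=False)
for piece, (start, end) in pretok.pre_tokenize("Hello world"):
    print(piece, start, end)
```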
@@ -31,10 +37,23 @@ pub struct ByteLevel {}
#[pymethods]
impl ByteLevel {
#[new]
-fn new(obj: &PyRawObject) -> PyResult<()> {
+#[args(kwargs = "**")]
+fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
+let mut add_prefix_space = true;
+if let Some(kwargs) = kwargs {
+for (key, value) in kwargs {
+let key: &str = key.extract()?;
+match key {
+"add_prefix_space" => add_prefix_space = value.extract()?,
+_ => println!("Ignored unknown kwargs option {}", key),
+}
+}
+}
Ok(obj.init(PreTokenizer {
pretok: Container::Owned(Box::new(tk::pre_tokenizers::byte_level::ByteLevel::new(
-false,
+add_prefix_space,
))),
}))
}
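
With the new `kwargs` handling, the constructor accepts `add_prefix_space` and, per the `println!` above, merely warns about unknown options instead of raising:

```python
from tokenizers import pre_tokenizers

pretok = pre_tokenizers.ByteLevel(add_prefix_space=False)
# An unknown option is not an error; the binding just prints a warning:
pretok = pre_tokenizers.ByteLevel(some_typo=True)
# -> Ignored unknown kwargs option some_typo
```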
@@ -151,11 +170,14 @@ impl PyPreTokenizer {
}
impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
-fn pre_tokenize(&self, sentence: &str) -> Result<Vec<(String, Offsets)>> {
+fn pre_tokenize(
+&self,
+sentence: &mut tk::tokenizer::NormalizedString,
+) -> Result<Vec<(String, Offsets)>> {
let gil = Python::acquire_gil();
let py = gil.python();
-let args = PyTuple::new(py, &[sentence]);
+let args = PyTuple::new(py, &[sentence.get()]);
match self.class.call_method(py, "pre_tokenize", args, None) {
Ok(res) => Ok(res
.cast_as::<PyList>(py)

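The `PyPreTokenizer` wrapper calls a user-supplied object's `pre_tokenize` method and casts the result to a list. A hedged sketch of an object satisfying that protocol (`WhitespacePreTokenizer` is hypothetical; how it gets registered is outside this diff):

```python
# Hypothetical custom pre-tokenizer matching the expected protocol:
# pre_tokenize(str) -> list of (piece, (start, end)) tuples.
class WhitespacePreTokenizer:
    def pre_tokenize(self, sentence):
        pieces, start = [], 0
        for word in sentence.split(" "):
            pieces.append((word, (start, start + len(word))))
            start += len(word) + 1  # skip the space we split on
        return pieces
```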
View File

@@ -1,4 +1,3 @@
-import tokenizers
from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers, processors
from tokenizers.models import BPE
from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
@@ -37,9 +36,9 @@ class ByteLevelBPETokenizer(BaseTokenizer):
else:
tokenizer = Tokenizer(BPE.empty())
-normalizers = [tokenizers.normalizers.ByteLevel(add_prefix_space=add_prefix_space)]
# Check for Unicode normalization first (before everything else)
+normalizers = []
if unicode_normalizer:
normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
@@ -53,7 +52,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
else:
tokenizer.normalizer = normalizers[0]
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel()
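
A usage sketch for the updated implementation (not in the diff): the file-argument names are assumptions, while `add_prefix_space` and `unicode_normalizer` are the parameters handled above:

```python
from tokenizers.implementations import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer(
    "./vocab.json",   # assumed vocab file argument
    "./merges.txt",   # assumed merges file argument
    add_prefix_space=True,
    unicode_normalizer="nfkc",
)
```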

View File

@@ -9,7 +9,6 @@ NFKC = normalizers.NFKC
Sequence = normalizers.Sequence
Lowercase = normalizers.Lowercase
Strip = normalizers.Strip
-ByteLevel = normalizers.ByteLevel
NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
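
Together with the class removal from the Rust module earlier in this commit, dropping this re-export makes the attribute disappear from the Python package; only the pre-tokenizer variant remains:

```python
from tokenizers import normalizers, pre_tokenizers

assert not hasattr(normalizers, "ByteLevel")  # removed by this commit
assert hasattr(pre_tokenizers, "ByteLevel")   # add_prefix_space now lives here
```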

View File

@@ -98,22 +98,6 @@ class Strip(Normalizer):
def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
pass
-class ByteLevel(Normalizer):
-""" ByteLevel normalizer """
-def __init__(self, add_prefix_space: bool = True) -> Normalizer:
-""" Instantiate a new ByteLevel Normalizer
-Args:
-add_prefix_space: (`optional`) boolean:
-Whether to add a space to the first word if there isn't already one. This
-lets us treat `hello` exactly like `say hello`.
-Returns:
-Normalizer
-"""
-pass
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
"""
Instantiate unicode normalizer from the normalizer name
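
Given the `NORMALIZERS` mapping shown earlier, the accepted names should be `nfc`, `nfd`, `nfkc`, and `nfkd`. A one-line sketch:

```python
from tokenizers.normalizers import unicode_normalizer_from_str

normalizer = unicode_normalizer_from_str("nfkc")  # one of: nfc, nfd, nfkc, nfkd
```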

View File

@@ -20,8 +20,15 @@ class ByteLevel(PreTokenizer):
with a corresponding representation, as well as splitting into words.
"""
-def __init__(self) -> None:
-""" Instantiate a new ByteLevel PreTokenizer """
+def __init__(self, add_prefix_space: bool = True) -> None:
+""" Instantiate a new ByteLevel PreTokenizer
+Args:
+add_prefix_space: (`optional`) boolean:
+Whether to add a space to the first word if there isn't already one. This
+lets us treat `hello` exactly like `say hello`.
+Returns:
+PreTokenizer
+"""
pass
@staticmethod
def alphabet() -> List[str]:
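
The stub also declares a static `alphabet()`. Under the byte-level scheme it should enumerate one printable character per possible byte value, so a quick sanity check might look like:

```python
from tokenizers import pre_tokenizers

alphabet = pre_tokenizers.ByteLevel.alphabet()
print(len(alphabet))  # expected: 256, one character per byte value
```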

View File

@@ -2,6 +2,3 @@ pub mod bert;
pub mod strip;
pub mod unicode;
pub mod utils;
-// Re-export these as normalizers
-pub use super::pre_tokenizers::byte_level;