Update Python bindings with new interface

This commit is contained in:
Anthony MOI
2020-07-29 16:45:55 -04:00
committed by Anthony MOI
parent 261a0c6dd8
commit 7833965dc4
8 changed files with 101 additions and 243 deletions

View File

@@ -281,6 +281,15 @@ dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "0.4.5"
@@ -584,7 +593,7 @@ version = "0.1.0"
source = "git+https://github.com/n1t0/rayon-cond#c56e4f1ded0fcb92eac70e0533703bba3ca2983f"
dependencies = [
"either",
"itertools",
"itertools 0.8.2",
"rayon",
]
@@ -749,6 +758,7 @@ version = "0.10.1"
dependencies = [
"clap",
"indicatif",
"itertools 0.9.0",
"lazy_static",
"onig",
"rand",

View File

@@ -1,5 +1,4 @@
from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
from typing import List, Union, Tuple, Optional, Dict

View File

@@ -2,9 +2,6 @@ from typing import List, Tuple
from .. import models, Offsets
TokenizedSequence = List[str]
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
Model = models.Model
BPE = models.BPE
WordPiece = models.WordPiece

View File

@@ -1,9 +1,6 @@
from .. import Encoding, Offsets
from typing import List, Optional, Union, Tuple
TokenizedSequence = List[str]
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
class Model:
""" Base class for all models
@@ -19,57 +16,6 @@ class Model:
Any file with the same name that already exist in this folder will be overwritten.
"""
pass
def encode(
self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
) -> Encoding:
""" Encode the given sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
Either a TokenizedSequence or a TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
An Encoding
"""
pass
def encode_batch(
self,
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
type_id: int = 0,
) -> List[Encoding]:
""" Encode the given batch of sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
A list of sequence. Each sequence is either a TokenizedSequence or a
TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
A list of Encoding
"""
pass
class BPE(Model):
"""BytePairEncoding model class

View File

@@ -75,11 +75,7 @@ impl Encoding {
#[args(growing_offsets = true)]
fn merge(encodings: Vec<PyRef<Encoding>>, growing_offsets: bool) -> Encoding {
tk::tokenizer::Encoding::merge(
encodings
.into_iter()
.map(|e| e.encoding.clone())
.collect::<Vec<_>>()
.as_slice(),
encodings.into_iter().map(|e| e.encoding.clone()),
growing_offsets,
)
.into()

View File

@@ -27,3 +27,8 @@ impl<T> std::convert::Into<PyResult<T>> for ToPyResult<T> {
.map_err(|e| exceptions::Exception::py_err(format!("{}", e)))
}
}
impl<T> ToPyResult<T> {
pub fn into_py(self) -> PyResult<T> {
self.into()
}
}

View File

@@ -1,78 +1,11 @@
extern crate tokenizers as tk;
use super::encoding::Encoding;
use super::error::ToPyResult;
use super::utils::Container;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use std::path::Path;
use tk::parallelism::*;
#[pyclass]
struct EncodeInput {
sequence: Vec<(String, (usize, usize))>,
}
impl EncodeInput {
pub fn into_input(self) -> Vec<(String, (usize, usize))> {
self.sequence
}
}
impl<'source> FromPyObject<'source> for EncodeInput {
fn extract(ob: &'source PyAny) -> PyResult<Self> {
let sequence: &PyList = ob.downcast()?;
enum Mode {
NoOffsets,
Offsets,
};
let mode = sequence
.iter()
.next()
.map(|item| {
if item.extract::<String>().is_ok() {
Ok(Mode::NoOffsets)
} else if item.extract::<(String, (usize, usize))>().is_ok() {
Ok(Mode::Offsets)
} else {
Err(exceptions::ValueError::py_err(
"Input must be a list[str] or list[(str, (int, int))]",
))
}
})
.unwrap()?;
let mut total_len = 0;
let sequence = sequence
.iter()
.enumerate()
.map(|(i, item)| match mode {
Mode::NoOffsets => item
.extract::<String>()
.map_err(|_| {
exceptions::ValueError::py_err(format!(
"Value at index {} should be a `str`",
i
))
})
.map(|s| {
let len = s.chars().count();
total_len += len;
(s, (total_len - len, total_len))
}),
Mode::Offsets => item.extract::<(String, (usize, usize))>().map_err(|_| {
exceptions::ValueError::py_err(format!(
"Value at index {} should be a `(str, (int, int))`",
i
))
}),
})
.collect::<Result<Vec<_>, PyErr>>()?;
Ok(EncodeInput { sequence })
}
}
/// A Model represents some tokenization algorithm like BPE or Word
/// This class cannot be constructed directly. Please use one of the concrete models.
@@ -133,42 +66,6 @@ impl Model {
.map(|path| path.to_string_lossy().into_owned())
.collect())
}
#[args(type_id = 0)]
fn encode(&self, sequence: EncodeInput, type_id: u32) -> PyResult<Encoding> {
let sequence = sequence.into_input();
if sequence.is_empty() {
return Ok(tk::tokenizer::Encoding::default().into());
}
ToPyResult(self.model.execute(|model| {
model
.tokenize(sequence)
.map(|tokens| tk::tokenizer::Encoding::from_tokens(tokens, type_id).into())
}))
.into()
}
#[args(type_id = 0)]
fn encode_batch(&self, sequences: Vec<EncodeInput>, type_id: u32) -> PyResult<Vec<Encoding>> {
ToPyResult(self.model.execute(|model| {
sequences
.into_maybe_par_iter()
.map(|sequence| {
let sequence = sequence.into_input();
if sequence.is_empty() {
Ok(tk::tokenizer::Encoding::default().into())
} else {
model.tokenize(sequence).map(|tokens| {
tk::tokenizer::Encoding::from_tokens(tokens, type_id).into()
})
}
})
.collect::<Result<_, _>>()
}))
.into()
}
}
/// BPE Model

View File

@@ -1,12 +1,11 @@
extern crate tokenizers as tk;
use super::error::{PyError, ToPyResult};
use super::error::ToPyResult;
use super::utils::Container;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use tk::tokenizer::{Offsets, Result};
use tk::tokenizer::Offsets;
#[pyclass(dict, module = "tokenizers.pre_tokenizers")]
pub struct PreTokenizer {
@@ -14,13 +13,13 @@ pub struct PreTokenizer {
}
#[pymethods]
impl PreTokenizer {
#[staticmethod]
fn custom(pretok: PyObject) -> PyResult<Self> {
let py_pretok = PyPreTokenizer::new(pretok)?;
Ok(PreTokenizer {
pretok: Container::Owned(Box::new(py_pretok)),
})
}
// #[staticmethod]
// fn custom(pretok: PyObject) -> PyResult<Self> {
// let py_pretok = PyPreTokenizer::new(pretok)?;
// Ok(PreTokenizer {
// pretok: Container::Owned(Box::new(py_pretok)),
// })
// }
fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
let data = self
@@ -52,13 +51,20 @@ impl PreTokenizer {
}
fn pre_tokenize(&self, s: &str) -> PyResult<Vec<(String, Offsets)>> {
// TODO: Expose the NormalizedString
let mut normalized = tk::tokenizer::NormalizedString::from(s);
// TODO: Expose the PreTokenizedString
let mut pretokenized = tk::tokenizer::PreTokenizedString::from(s);
ToPyResult(
self.pretok
.execute(|pretok| pretok.pre_tokenize(&mut normalized)),
.execute(|pretok| pretok.pre_tokenize(&mut pretokenized)),
)
.into()
.into_py()?;
Ok(pretokenized
.get_normalized(true)
.into_iter()
.map(|(s, o)| (s.to_owned(), o))
.collect())
}
}
@@ -108,7 +114,9 @@ impl Whitespace {
Ok((
Whitespace {},
PreTokenizer {
pretok: Container::Owned(Box::new(tk::pre_tokenizers::whitespace::Whitespace)),
pretok: Container::Owned(Box::new(
tk::pre_tokenizers::whitespace::Whitespace::default(),
)),
},
))
}
@@ -209,64 +217,64 @@ impl Metaspace {
}
}
struct PyPreTokenizer {
class: PyObject,
}
impl PyPreTokenizer {
pub fn new(class: PyObject) -> PyResult<Self> {
Ok(PyPreTokenizer { class })
}
}
#[typetag::serde]
impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
fn pre_tokenize(
&self,
sentence: &mut tk::tokenizer::NormalizedString,
) -> Result<Vec<(String, Offsets)>> {
let gil = Python::acquire_gil();
let py = gil.python();
let args = PyTuple::new(py, &[sentence.get()]);
match self.class.call_method(py, "pre_tokenize", args, None) {
Ok(res) => Ok(res
.cast_as::<PyList>(py)
.map_err(|_| {
PyError::from("`pre_tokenize is expected to return a List[(str, (uint, uint))]")
})?
.extract::<Vec<(String, Offsets)>>()
.map_err(|_| {
PyError::from(
"`pre_tokenize` is expected to return a List[(str, (uint, uint))]",
)
})?),
Err(e) => {
e.print(py);
Err(Box::new(PyError::from(
"Error while calling `pre_tokenize`",
)))
}
}
}
}
impl Serialize for PyPreTokenizer {
fn serialize<S>(&self, _serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: Serializer,
{
Err(serde::ser::Error::custom(
"Custom PyPreTokenizer cannot be serialized",
))
}
}
impl<'de> Deserialize<'de> for PyPreTokenizer {
fn deserialize<D>(_deserializer: D) -> std::result::Result<Self, D::Error>
where
D: Deserializer<'de>,
{
unimplemented!("PyPreTokenizer cannot be deserialized")
}
}
// struct PyPreTokenizer {
// class: PyObject,
// }
//
// impl PyPreTokenizer {
// pub fn new(class: PyObject) -> PyResult<Self> {
// Ok(PyPreTokenizer { class })
// }
// }
//
// #[typetag::serde]
// impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
// fn pre_tokenize(
// &self,
// sentence: &mut tk::tokenizer::NormalizedString,
// ) -> Result<Vec<(String, Offsets)>> {
// let gil = Python::acquire_gil();
// let py = gil.python();
//
// let args = PyTuple::new(py, &[sentence.get()]);
// match self.class.call_method(py, "pre_tokenize", args, None) {
// Ok(res) => Ok(res
// .cast_as::<PyList>(py)
// .map_err(|_| {
// PyError::from("`pre_tokenize is expected to return a List[(str, (uint, uint))]")
// })?
// .extract::<Vec<(String, Offsets)>>()
// .map_err(|_| {
// PyError::from(
// "`pre_tokenize` is expected to return a List[(str, (uint, uint))]",
// )
// })?),
// Err(e) => {
// e.print(py);
// Err(Box::new(PyError::from(
// "Error while calling `pre_tokenize`",
// )))
// }
// }
// }
// }
//
// impl Serialize for PyPreTokenizer {
// fn serialize<S>(&self, _serializer: S) -> std::result::Result<S::Ok, S::Error>
// where
// S: Serializer,
// {
// Err(serde::ser::Error::custom(
// "Custom PyPreTokenizer cannot be serialized",
// ))
// }
// }
//
// impl<'de> Deserialize<'de> for PyPreTokenizer {
// fn deserialize<D>(_deserializer: D) -> std::result::Result<Self, D::Error>
// where
// D: Deserializer<'de>,
// {
// unimplemented!("PyPreTokenizer cannot be deserialized")
// }
// }