Update Python bindings with new interface

This commit is contained in:
Anthony MOI
2020-07-29 16:45:55 -04:00
committed by Anthony MOI
parent 261a0c6dd8
commit 7833965dc4
8 changed files with 101 additions and 243 deletions

View File

@ -281,6 +281,15 @@ dependencies = [
"either", "either",
] ]
[[package]]
name = "itertools"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b"
dependencies = [
"either",
]
[[package]] [[package]]
name = "itoa" name = "itoa"
version = "0.4.5" version = "0.4.5"
@ -584,7 +593,7 @@ version = "0.1.0"
source = "git+https://github.com/n1t0/rayon-cond#c56e4f1ded0fcb92eac70e0533703bba3ca2983f" source = "git+https://github.com/n1t0/rayon-cond#c56e4f1ded0fcb92eac70e0533703bba3ca2983f"
dependencies = [ dependencies = [
"either", "either",
"itertools", "itertools 0.8.2",
"rayon", "rayon",
] ]
@ -749,6 +758,7 @@ version = "0.10.1"
dependencies = [ dependencies = [
"clap", "clap",
"indicatif", "indicatif",
"itertools 0.9.0",
"lazy_static", "lazy_static",
"onig", "onig",
"rand", "rand",

View File

@ -1,5 +1,4 @@
from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
from typing import List, Union, Tuple, Optional, Dict from typing import List, Union, Tuple, Optional, Dict

View File

@ -2,9 +2,6 @@ from typing import List, Tuple
from .. import models, Offsets from .. import models, Offsets
TokenizedSequence = List[str]
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
Model = models.Model Model = models.Model
BPE = models.BPE BPE = models.BPE
WordPiece = models.WordPiece WordPiece = models.WordPiece

View File

@ -1,9 +1,6 @@
from .. import Encoding, Offsets from .. import Encoding, Offsets
from typing import List, Optional, Union, Tuple from typing import List, Optional, Union, Tuple
TokenizedSequence = List[str]
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
class Model: class Model:
""" Base class for all models """ Base class for all models
@ -19,57 +16,6 @@ class Model:
Any file with the same name that already exists in this folder will be overwritten. Any file with the same name that already exists in this folder will be overwritten.
""" """
pass pass
def encode(
self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
) -> Encoding:
""" Encode the given sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
Either a TokenizedSequence or a TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
An Encoding
"""
pass
def encode_batch(
self,
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
type_id: int = 0,
) -> List[Encoding]:
""" Encode the given batch of sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
A list of sequence. Each sequence is either a TokenizedSequence or a
TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
A list of Encoding
"""
pass
class BPE(Model): class BPE(Model):
"""BytePairEncoding model class """BytePairEncoding model class

View File

@ -75,11 +75,7 @@ impl Encoding {
#[args(growing_offsets = true)] #[args(growing_offsets = true)]
fn merge(encodings: Vec<PyRef<Encoding>>, growing_offsets: bool) -> Encoding { fn merge(encodings: Vec<PyRef<Encoding>>, growing_offsets: bool) -> Encoding {
tk::tokenizer::Encoding::merge( tk::tokenizer::Encoding::merge(
encodings encodings.into_iter().map(|e| e.encoding.clone()),
.into_iter()
.map(|e| e.encoding.clone())
.collect::<Vec<_>>()
.as_slice(),
growing_offsets, growing_offsets,
) )
.into() .into()

View File

@ -27,3 +27,8 @@ impl<T> std::convert::Into<PyResult<T>> for ToPyResult<T> {
.map_err(|e| exceptions::Exception::py_err(format!("{}", e))) .map_err(|e| exceptions::Exception::py_err(format!("{}", e)))
} }
} }
impl<T> ToPyResult<T> {
pub fn into_py(self) -> PyResult<T> {
self.into()
}
}

View File

@ -1,78 +1,11 @@
extern crate tokenizers as tk; extern crate tokenizers as tk;
use super::encoding::Encoding;
use super::error::ToPyResult; use super::error::ToPyResult;
use super::utils::Container; use super::utils::Container;
use pyo3::exceptions; use pyo3::exceptions;
use pyo3::prelude::*; use pyo3::prelude::*;
use pyo3::types::*; use pyo3::types::*;
use std::path::Path; use std::path::Path;
use tk::parallelism::*;
#[pyclass]
struct EncodeInput {
sequence: Vec<(String, (usize, usize))>,
}
impl EncodeInput {
pub fn into_input(self) -> Vec<(String, (usize, usize))> {
self.sequence
}
}
impl<'source> FromPyObject<'source> for EncodeInput {
fn extract(ob: &'source PyAny) -> PyResult<Self> {
let sequence: &PyList = ob.downcast()?;
enum Mode {
NoOffsets,
Offsets,
};
let mode = sequence
.iter()
.next()
.map(|item| {
if item.extract::<String>().is_ok() {
Ok(Mode::NoOffsets)
} else if item.extract::<(String, (usize, usize))>().is_ok() {
Ok(Mode::Offsets)
} else {
Err(exceptions::ValueError::py_err(
"Input must be a list[str] or list[(str, (int, int))]",
))
}
})
.unwrap()?;
let mut total_len = 0;
let sequence = sequence
.iter()
.enumerate()
.map(|(i, item)| match mode {
Mode::NoOffsets => item
.extract::<String>()
.map_err(|_| {
exceptions::ValueError::py_err(format!(
"Value at index {} should be a `str`",
i
))
})
.map(|s| {
let len = s.chars().count();
total_len += len;
(s, (total_len - len, total_len))
}),
Mode::Offsets => item.extract::<(String, (usize, usize))>().map_err(|_| {
exceptions::ValueError::py_err(format!(
"Value at index {} should be a `(str, (int, int))`",
i
))
}),
})
.collect::<Result<Vec<_>, PyErr>>()?;
Ok(EncodeInput { sequence })
}
}
/// A Model represents some tokenization algorithm like BPE or Word /// A Model represents some tokenization algorithm like BPE or Word
/// This class cannot be constructed directly. Please use one of the concrete models. /// This class cannot be constructed directly. Please use one of the concrete models.
@ -133,42 +66,6 @@ impl Model {
.map(|path| path.to_string_lossy().into_owned()) .map(|path| path.to_string_lossy().into_owned())
.collect()) .collect())
} }
#[args(type_id = 0)]
fn encode(&self, sequence: EncodeInput, type_id: u32) -> PyResult<Encoding> {
let sequence = sequence.into_input();
if sequence.is_empty() {
return Ok(tk::tokenizer::Encoding::default().into());
}
ToPyResult(self.model.execute(|model| {
model
.tokenize(sequence)
.map(|tokens| tk::tokenizer::Encoding::from_tokens(tokens, type_id).into())
}))
.into()
}
#[args(type_id = 0)]
fn encode_batch(&self, sequences: Vec<EncodeInput>, type_id: u32) -> PyResult<Vec<Encoding>> {
ToPyResult(self.model.execute(|model| {
sequences
.into_maybe_par_iter()
.map(|sequence| {
let sequence = sequence.into_input();
if sequence.is_empty() {
Ok(tk::tokenizer::Encoding::default().into())
} else {
model.tokenize(sequence).map(|tokens| {
tk::tokenizer::Encoding::from_tokens(tokens, type_id).into()
})
}
})
.collect::<Result<_, _>>()
}))
.into()
}
} }
/// BPE Model /// BPE Model

View File

@ -1,12 +1,11 @@
extern crate tokenizers as tk; extern crate tokenizers as tk;
use super::error::{PyError, ToPyResult}; use super::error::ToPyResult;
use super::utils::Container; use super::utils::Container;
use pyo3::exceptions; use pyo3::exceptions;
use pyo3::prelude::*; use pyo3::prelude::*;
use pyo3::types::*; use pyo3::types::*;
use serde::{Deserialize, Deserializer, Serialize, Serializer}; use tk::tokenizer::Offsets;
use tk::tokenizer::{Offsets, Result};
#[pyclass(dict, module = "tokenizers.pre_tokenizers")] #[pyclass(dict, module = "tokenizers.pre_tokenizers")]
pub struct PreTokenizer { pub struct PreTokenizer {
@ -14,13 +13,13 @@ pub struct PreTokenizer {
} }
#[pymethods] #[pymethods]
impl PreTokenizer { impl PreTokenizer {
#[staticmethod] // #[staticmethod]
fn custom(pretok: PyObject) -> PyResult<Self> { // fn custom(pretok: PyObject) -> PyResult<Self> {
let py_pretok = PyPreTokenizer::new(pretok)?; // let py_pretok = PyPreTokenizer::new(pretok)?;
Ok(PreTokenizer { // Ok(PreTokenizer {
pretok: Container::Owned(Box::new(py_pretok)), // pretok: Container::Owned(Box::new(py_pretok)),
}) // })
} // }
fn __getstate__(&self, py: Python) -> PyResult<PyObject> { fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
let data = self let data = self
@ -52,13 +51,20 @@ impl PreTokenizer {
} }
fn pre_tokenize(&self, s: &str) -> PyResult<Vec<(String, Offsets)>> { fn pre_tokenize(&self, s: &str) -> PyResult<Vec<(String, Offsets)>> {
// TODO: Expose the NormalizedString // TODO: Expose the PreTokenizedString
let mut normalized = tk::tokenizer::NormalizedString::from(s); let mut pretokenized = tk::tokenizer::PreTokenizedString::from(s);
ToPyResult( ToPyResult(
self.pretok self.pretok
.execute(|pretok| pretok.pre_tokenize(&mut normalized)), .execute(|pretok| pretok.pre_tokenize(&mut pretokenized)),
) )
.into() .into_py()?;
Ok(pretokenized
.get_normalized(true)
.into_iter()
.map(|(s, o)| (s.to_owned(), o))
.collect())
} }
} }
@ -108,7 +114,9 @@ impl Whitespace {
Ok(( Ok((
Whitespace {}, Whitespace {},
PreTokenizer { PreTokenizer {
pretok: Container::Owned(Box::new(tk::pre_tokenizers::whitespace::Whitespace)), pretok: Container::Owned(Box::new(
tk::pre_tokenizers::whitespace::Whitespace::default(),
)),
}, },
)) ))
} }
@ -209,64 +217,64 @@ impl Metaspace {
} }
} }
struct PyPreTokenizer { // struct PyPreTokenizer {
class: PyObject, // class: PyObject,
} // }
//
impl PyPreTokenizer { // impl PyPreTokenizer {
pub fn new(class: PyObject) -> PyResult<Self> { // pub fn new(class: PyObject) -> PyResult<Self> {
Ok(PyPreTokenizer { class }) // Ok(PyPreTokenizer { class })
} // }
} // }
//
#[typetag::serde] // #[typetag::serde]
impl tk::tokenizer::PreTokenizer for PyPreTokenizer { // impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
fn pre_tokenize( // fn pre_tokenize(
&self, // &self,
sentence: &mut tk::tokenizer::NormalizedString, // sentence: &mut tk::tokenizer::NormalizedString,
) -> Result<Vec<(String, Offsets)>> { // ) -> Result<Vec<(String, Offsets)>> {
let gil = Python::acquire_gil(); // let gil = Python::acquire_gil();
let py = gil.python(); // let py = gil.python();
//
let args = PyTuple::new(py, &[sentence.get()]); // let args = PyTuple::new(py, &[sentence.get()]);
match self.class.call_method(py, "pre_tokenize", args, None) { // match self.class.call_method(py, "pre_tokenize", args, None) {
Ok(res) => Ok(res // Ok(res) => Ok(res
.cast_as::<PyList>(py) // .cast_as::<PyList>(py)
.map_err(|_| { // .map_err(|_| {
PyError::from("`pre_tokenize is expected to return a List[(str, (uint, uint))]") // PyError::from("`pre_tokenize is expected to return a List[(str, (uint, uint))]")
})? // })?
.extract::<Vec<(String, Offsets)>>() // .extract::<Vec<(String, Offsets)>>()
.map_err(|_| { // .map_err(|_| {
PyError::from( // PyError::from(
"`pre_tokenize` is expected to return a List[(str, (uint, uint))]", // "`pre_tokenize` is expected to return a List[(str, (uint, uint))]",
) // )
})?), // })?),
Err(e) => { // Err(e) => {
e.print(py); // e.print(py);
Err(Box::new(PyError::from( // Err(Box::new(PyError::from(
"Error while calling `pre_tokenize`", // "Error while calling `pre_tokenize`",
))) // )))
} // }
} // }
} // }
} // }
//
impl Serialize for PyPreTokenizer { // impl Serialize for PyPreTokenizer {
fn serialize<S>(&self, _serializer: S) -> std::result::Result<S::Ok, S::Error> // fn serialize<S>(&self, _serializer: S) -> std::result::Result<S::Ok, S::Error>
where // where
S: Serializer, // S: Serializer,
{ // {
Err(serde::ser::Error::custom( // Err(serde::ser::Error::custom(
"Custom PyPreTokenizer cannot be serialized", // "Custom PyPreTokenizer cannot be serialized",
)) // ))
} // }
} // }
//
impl<'de> Deserialize<'de> for PyPreTokenizer { // impl<'de> Deserialize<'de> for PyPreTokenizer {
fn deserialize<D>(_deserializer: D) -> std::result::Result<Self, D::Error> // fn deserialize<D>(_deserializer: D) -> std::result::Result<Self, D::Error>
where // where
D: Deserializer<'de>, // D: Deserializer<'de>,
{ // {
unimplemented!("PyPreTokenizer cannot be deserialized") // unimplemented!("PyPreTokenizer cannot be deserialized")
} // }
} // }