mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-09 06:08:22 +00:00
Python - Rewrite PyDecoder and PyPreTokenizer
This commit is contained in:
@@ -55,46 +55,27 @@ struct PyDecoder {
|
|||||||
|
|
||||||
impl PyDecoder {
|
impl PyDecoder {
|
||||||
pub fn new(class: PyObject) -> PyResult<Self> {
|
pub fn new(class: PyObject) -> PyResult<Self> {
|
||||||
let decoder = PyDecoder { class };
|
Ok(PyDecoder { class })
|
||||||
|
|
||||||
// Quickly test the PyDecoder
|
|
||||||
decoder._decode(vec![
|
|
||||||
"This".into(),
|
|
||||||
"is".into(),
|
|
||||||
"a".into(),
|
|
||||||
"sentence".into(),
|
|
||||||
])?;
|
|
||||||
|
|
||||||
Ok(decoder)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn _decode(&self, tokens: Vec<String>) -> PyResult<String> {
|
|
||||||
let gil = Python::acquire_gil();
|
|
||||||
let py = gil.python();
|
|
||||||
|
|
||||||
let args = PyTuple::new(py, &[tokens]);
|
|
||||||
let res = self.class.call_method(py, "decode", args, None)?;
|
|
||||||
|
|
||||||
let decoded = res
|
|
||||||
.cast_as::<PyString>(py)
|
|
||||||
.map_err(|_| exceptions::TypeError::py_err("`decode` is expected to return a str"))?;
|
|
||||||
|
|
||||||
Ok(decoded.to_string()?.into_owned())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl tk::tokenizer::Decoder for PyDecoder {
|
impl tk::tokenizer::Decoder for PyDecoder {
|
||||||
fn decode(&self, tokens: Vec<String>) -> String {
|
fn decode(&self, tokens: Vec<String>) -> Result<String> {
|
||||||
match self._decode(tokens) {
|
let gil = Python::acquire_gil();
|
||||||
Ok(res) => res,
|
let py = gil.python();
|
||||||
Err(e) => {
|
|
||||||
let gil = Python::acquire_gil();
|
|
||||||
let py = gil.python();
|
|
||||||
e.print(py);
|
|
||||||
|
|
||||||
// Return an empty string as fallback
|
let args = PyTuple::new(py, &[tokens]);
|
||||||
String::from("")
|
match self.class.call_method(py, "decode", args, None) {
|
||||||
}
|
Ok(res) => Ok(res
|
||||||
|
.cast_as::<PyString>(py)
|
||||||
|
.map_err(|_| PyError::from("`decode` is expected to return a str"))?
|
||||||
|
.to_string()
|
||||||
|
.map_err(|_| PyError::from("`decode` is expected to return a str"))?
|
||||||
|
.into_owned()),
|
||||||
|
Err(e) => Err(Box::new(PyError(format!(
|
||||||
|
"Error while calling `decode`: {:?}",
|
||||||
|
e
|
||||||
|
)))),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -66,44 +66,26 @@ struct PyPreTokenizer {
|
|||||||
|
|
||||||
impl PyPreTokenizer {
|
impl PyPreTokenizer {
|
||||||
pub fn new(class: PyObject) -> PyResult<Self> {
|
pub fn new(class: PyObject) -> PyResult<Self> {
|
||||||
let pretok = PyPreTokenizer { class };
|
Ok(PyPreTokenizer { class })
|
||||||
|
|
||||||
// Quickly test the PyPreTokenizer
|
|
||||||
pretok._pre_tokenize("This is a test sentence")?;
|
|
||||||
|
|
||||||
Ok(pretok)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn _pre_tokenize(&self, sentence: &str) -> PyResult<Vec<String>> {
|
|
||||||
let gil = Python::acquire_gil();
|
|
||||||
let py = gil.python();
|
|
||||||
|
|
||||||
let args = PyTuple::new(py, &[sentence]);
|
|
||||||
let res = self.class.call_method(py, "pre_tokenize", args, None)?;
|
|
||||||
|
|
||||||
let tokens = res.cast_as::<PyList>(py).map_err(|_| {
|
|
||||||
exceptions::TypeError::py_err("`pre_tokenize` is expected to return a List[str]`")
|
|
||||||
})?;
|
|
||||||
let tokens: Vec<String> = tokens.extract().map_err(|_| {
|
|
||||||
exceptions::TypeError::py_err("`pre_tokenize` is expected to return a List[str]`")
|
|
||||||
})?;
|
|
||||||
|
|
||||||
Ok(tokens)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
|
impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
|
||||||
fn pre_tokenize(&self, sentence: &str) -> Vec<String> {
|
fn pre_tokenize(&self, sentence: &str) -> Result<Vec<String>> {
|
||||||
match self._pre_tokenize(sentence) {
|
let gil = Python::acquire_gil();
|
||||||
Ok(res) => res,
|
let py = gil.python();
|
||||||
Err(e) => {
|
|
||||||
let gil = Python::acquire_gil();
|
|
||||||
let py = gil.python();
|
|
||||||
e.print(py);
|
|
||||||
|
|
||||||
// Return an empty Vec as fallback
|
let args = PyTuple::new(py, &[sentence]);
|
||||||
vec![]
|
match self.class.call_method(py, "pre_tokenize", args, None) {
|
||||||
}
|
Ok(res) => Ok(res
|
||||||
|
.cast_as::<PyList>(py)
|
||||||
|
.map_err(|_| PyError::from("`pre_tokenize is expected to return a List[str]"))?
|
||||||
|
.extract::<Vec<String>>()
|
||||||
|
.map_err(|_| PyError::from("`pre_tokenize` is expected to return a List[str]"))?),
|
||||||
|
Err(e) => Err(Box::new(PyError(format!(
|
||||||
|
"Error while calling `pre_tokenize`: {:?}",
|
||||||
|
e
|
||||||
|
)))),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user