Mirror of https://github.com/mii443/tokenizers.git, synced 2025-09-04 08:19:21 +00:00.
Python — Rewrite PyDecoder and PyPreTokenizer.
This commit is contained in:
@ -55,46 +55,27 @@ struct PyDecoder {
|
||||
|
||||
impl PyDecoder {
|
||||
pub fn new(class: PyObject) -> PyResult<Self> {
|
||||
let decoder = PyDecoder { class };
|
||||
|
||||
// Quickly test the PyDecoder
|
||||
decoder._decode(vec![
|
||||
"This".into(),
|
||||
"is".into(),
|
||||
"a".into(),
|
||||
"sentence".into(),
|
||||
])?;
|
||||
|
||||
Ok(decoder)
|
||||
}
|
||||
|
||||
fn _decode(&self, tokens: Vec<String>) -> PyResult<String> {
|
||||
let gil = Python::acquire_gil();
|
||||
let py = gil.python();
|
||||
|
||||
let args = PyTuple::new(py, &[tokens]);
|
||||
let res = self.class.call_method(py, "decode", args, None)?;
|
||||
|
||||
let decoded = res
|
||||
.cast_as::<PyString>(py)
|
||||
.map_err(|_| exceptions::TypeError::py_err("`decode` is expected to return a str"))?;
|
||||
|
||||
Ok(decoded.to_string()?.into_owned())
|
||||
Ok(PyDecoder { class })
|
||||
}
|
||||
}
|
||||
|
||||
impl tk::tokenizer::Decoder for PyDecoder {
|
||||
fn decode(&self, tokens: Vec<String>) -> String {
|
||||
match self._decode(tokens) {
|
||||
Ok(res) => res,
|
||||
Err(e) => {
|
||||
fn decode(&self, tokens: Vec<String>) -> Result<String> {
|
||||
let gil = Python::acquire_gil();
|
||||
let py = gil.python();
|
||||
e.print(py);
|
||||
|
||||
// Return an empty string as fallback
|
||||
String::from("")
|
||||
}
|
||||
let args = PyTuple::new(py, &[tokens]);
|
||||
match self.class.call_method(py, "decode", args, None) {
|
||||
Ok(res) => Ok(res
|
||||
.cast_as::<PyString>(py)
|
||||
.map_err(|_| PyError::from("`decode` is expected to return a str"))?
|
||||
.to_string()
|
||||
.map_err(|_| PyError::from("`decode` is expected to return a str"))?
|
||||
.into_owned()),
|
||||
Err(e) => Err(Box::new(PyError(format!(
|
||||
"Error while calling `decode`: {:?}",
|
||||
e
|
||||
)))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -66,44 +66,26 @@ struct PyPreTokenizer {
|
||||
|
||||
impl PyPreTokenizer {
|
||||
pub fn new(class: PyObject) -> PyResult<Self> {
|
||||
let pretok = PyPreTokenizer { class };
|
||||
|
||||
// Quickly test the PyPreTokenizer
|
||||
pretok._pre_tokenize("This is a test sentence")?;
|
||||
|
||||
Ok(pretok)
|
||||
}
|
||||
|
||||
fn _pre_tokenize(&self, sentence: &str) -> PyResult<Vec<String>> {
|
||||
let gil = Python::acquire_gil();
|
||||
let py = gil.python();
|
||||
|
||||
let args = PyTuple::new(py, &[sentence]);
|
||||
let res = self.class.call_method(py, "pre_tokenize", args, None)?;
|
||||
|
||||
let tokens = res.cast_as::<PyList>(py).map_err(|_| {
|
||||
exceptions::TypeError::py_err("`pre_tokenize` is expected to return a List[str]`")
|
||||
})?;
|
||||
let tokens: Vec<String> = tokens.extract().map_err(|_| {
|
||||
exceptions::TypeError::py_err("`pre_tokenize` is expected to return a List[str]`")
|
||||
})?;
|
||||
|
||||
Ok(tokens)
|
||||
Ok(PyPreTokenizer { class })
|
||||
}
|
||||
}
|
||||
|
||||
impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
|
||||
fn pre_tokenize(&self, sentence: &str) -> Vec<String> {
|
||||
match self._pre_tokenize(sentence) {
|
||||
Ok(res) => res,
|
||||
Err(e) => {
|
||||
fn pre_tokenize(&self, sentence: &str) -> Result<Vec<String>> {
|
||||
let gil = Python::acquire_gil();
|
||||
let py = gil.python();
|
||||
e.print(py);
|
||||
|
||||
// Return an empty Vec as fallback
|
||||
vec![]
|
||||
}
|
||||
let args = PyTuple::new(py, &[sentence]);
|
||||
match self.class.call_method(py, "pre_tokenize", args, None) {
|
||||
Ok(res) => Ok(res
|
||||
.cast_as::<PyList>(py)
|
||||
.map_err(|_| PyError::from("`pre_tokenize is expected to return a List[str]"))?
|
||||
.extract::<Vec<String>>()
|
||||
.map_err(|_| PyError::from("`pre_tokenize` is expected to return a List[str]"))?),
|
||||
Err(e) => Err(Box::new(PyError(format!(
|
||||
"Error while calling `pre_tokenize`: {:?}",
|
||||
e
|
||||
)))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user