mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Python - Custom PreTokenizer backbone
This commit is contained in:
@ -7,6 +7,16 @@ use pyo3::prelude::*;
|
||||
/// Wrapper type holding a pre-tokenizer implementation.
///
/// Its methods are exposed to Python through the `#[pymethods]` impl block
/// for this type.
pub struct PreTokenizer {
|
||||
/// The wrapped pre-tokenizer: a `Container` around a
/// `dyn tk::tokenizer::PreTokenizer + Sync` trait object.
// NOTE(review): `Container` is defined elsewhere; its ownership semantics
// are not visible here — confirm before relying on them.
pub pretok: Container<dyn tk::tokenizer::PreTokenizer + Sync>,
|
||||
}
|
||||
// Methods of `PreTokenizer` exported through pyo3's `#[pymethods]`.
#[pymethods]
|
||||
impl PreTokenizer {
|
||||
/// Builds a `PreTokenizer` backed by a Python object.
///
/// `pretok` is handed to `PyPreTokenizer::new`, and the resulting value is
/// boxed and stored in `Container::Owned`.
///
/// # Errors
///
/// Returns whatever error `PyPreTokenizer::new` reports.
#[staticmethod]
|
||||
fn from_python(pretok: PyObject) -> PyResult<Self> {
|
||||
// `?` forwards a failure from the adapter's constructor to the caller.
let py_pretok = PyPreTokenizer::new(pretok)?;
|
||||
Ok(PreTokenizer {
|
||||
// The adapter is boxed and stored in the `Owned` variant.
pretok: Container::Owned(Box::new(py_pretok)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Field-less struct registered as a Python class via `#[pyclass]`.
// NOTE(review): presumably the Python handle for the byte-level
// pre-tokenizer — confirm against its `impl ByteLevel` block.
#[pyclass]
|
||||
pub struct ByteLevel {}
|
||||
@ -19,3 +29,21 @@ impl ByteLevel {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Adapter intended to let Python supply its own `PreTokenizer`.
///
/// Work in progress: the `tk::tokenizer::PreTokenizer` implementation for this
/// type currently consists of `unimplemented!()` only.
|
||||
struct PyPreTokenizer {
|
||||
/// The Python object passed to `PyPreTokenizer::new`, stored as-is.
// NOTE(review): presumably an instance exposing a `pre_tokenize` method —
// nothing in this file enforces that yet.
class: PyObject,
|
||||
}
|
||||
|
||||
impl PyPreTokenizer {
|
||||
/// Wraps `class` in a `PyPreTokenizer`.
///
/// No validation is performed yet, so this currently always returns `Ok`.
pub fn new(class: PyObject) -> PyResult<Self> {
|
||||
// TODO: test the given PyObject; for now it is accepted without any check.
|
||||
Ok(PyPreTokenizer { class })
|
||||
}
|
||||
}
|
||||
|
||||
// Trait implementation that makes `PyPreTokenizer` usable as a
// `tk::tokenizer::PreTokenizer`. It is a stub for now.
impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
|
||||
/// Not implemented yet.
///
/// # Panics
///
/// Always panics, because the body is `unimplemented!()`.
fn pre_tokenize(&self, sentence: &str) -> Vec<String> {
|
||||
// NOTE(review): presumably this will forward `sentence` to the wrapped
// Python object in `self.class` — confirm the intended design.
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user