mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Python - Custom PreTokenizer backbone
This commit is contained in:
@ -7,6 +7,16 @@ use pyo3::prelude::*;
|
|||||||
pub struct PreTokenizer {
|
pub struct PreTokenizer {
|
||||||
pub pretok: Container<dyn tk::tokenizer::PreTokenizer + Sync>,
|
pub pretok: Container<dyn tk::tokenizer::PreTokenizer + Sync>,
|
||||||
}
|
}
|
||||||
|
#[pymethods]
|
||||||
|
impl PreTokenizer {
|
||||||
|
/// Build a `PreTokenizer` whose implementation is delegated to a Python object.
///
/// `pretok` is handed to `PyPreTokenizer::new`; any error it returns is
/// propagated to the caller. On success the wrapper is boxed and stored as
/// an owned `Container`.
///
/// NOTE(review): `PyPreTokenizer::pre_tokenize` is still `unimplemented!()`,
/// so a `PreTokenizer` built here will panic as soon as it is asked to
/// pre-tokenize anything.
#[staticmethod]
|
||||||
|
fn from_python(pretok: PyObject) -> PyResult<Self> {
|
||||||
|
// Wrap the raw Python object; `?` forwards a construction failure as a `PyErr`.
let py_pretok = PyPreTokenizer::new(pretok)?;
|
||||||
|
Ok(PreTokenizer {
|
||||||
|
// The wrapper is owned by this `PreTokenizer` (boxed as a trait object).
pretok: Container::Owned(Box::new(py_pretok)),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[pyclass]
|
#[pyclass]
|
||||||
pub struct ByteLevel {}
|
pub struct ByteLevel {}
|
||||||
@ -19,3 +29,21 @@ impl ByteLevel {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Wrapper around a Python object so that Python code can supply its own
/// `PreTokenizer` implementation.
///
/// This is only a backbone for now: the wrapper stores the object but the
/// `tk::tokenizer::PreTokenizer` impl for it is not implemented yet.
|
||||||
|
struct PyPreTokenizer {
|
||||||
|
// The Python object received by `PyPreTokenizer::new`.
// NOTE(review): presumably an instance (or class) exposing a pre-tokenize
// method — nothing checks or calls it yet; confirm once implemented.
class: PyObject,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PyPreTokenizer {
|
||||||
|
/// Wrap the given Python object in a `PyPreTokenizer`.
///
/// Currently always returns `Ok`: the object is stored as-is without any
/// inspection. The `PyResult` return type leaves room for validation
/// errors once the check below is implemented.
pub fn new(class: PyObject) -> PyResult<Self> {
|
||||||
|
// TODO: test the given PyObject (no validation is performed yet)
|
||||||
|
Ok(PyPreTokenizer { class })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Placeholder trait implementation so that `PyPreTokenizer` can be stored
// wherever a `tk::tokenizer::PreTokenizer` is expected.
impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
|
||||||
|
/// Not implemented yet.
///
/// # Panics
///
/// Always panics via `unimplemented!()`; `sentence` is not used and the
/// wrapped Python object is never called.
fn pre_tokenize(&self, sentence: &str) -> Vec<String> {
|
||||||
|
// TODO: forward `sentence` to the wrapped Python object and convert its result.
unimplemented!()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user