From bd1aa80d8a6210b9591e877510ee6aa7aa8b4f02 Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Sat, 23 Nov 2019 23:59:33 -0500
Subject: [PATCH] Python - Custom PreTokenizer backbone

---
Reviewer note: the return types below are written as `PyResult<Self>` and
`Vec<String>`; the flattened copy of this patch had lost the generic
arguments. The `Container` context line and the author address are left as
found - confirm all of these against the upstream commit before applying.

 bindings/python/src/pre_tokenizers.rs | 28 +++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs
index 26eadc68..aa41be59 100644
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -7,6 +7,16 @@ use pyo3::prelude::*;
 pub struct PreTokenizer {
     pub pretok: Container,
 }
+#[pymethods]
+impl PreTokenizer {
+    #[staticmethod]
+    fn from_python(pretok: PyObject) -> PyResult<Self> {
+        let py_pretok = PyPreTokenizer::new(pretok)?;
+        Ok(PreTokenizer {
+            pretok: Container::Owned(Box::new(py_pretok)),
+        })
+    }
+}
 
 #[pyclass]
 pub struct ByteLevel {}
@@ -19,3 +29,21 @@ impl ByteLevel {
         })
     }
 }
+
+/// Attempt at providing Python the ability to give its own PreTokenizer
+struct PyPreTokenizer {
+    class: PyObject,
+}
+
+impl PyPreTokenizer {
+    pub fn new(class: PyObject) -> PyResult<Self> {
+        // test the given PyObject
+        Ok(PyPreTokenizer { class })
+    }
+}
+
+impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
+    fn pre_tokenize(&self, sentence: &str) -> Vec<String> {
+        unimplemented!()
+    }
+}