Expose the tokenizer's components (model, normalizer, pre-tokenizer,
post-processor, decoder) as Python properties (getters only).

  * tokenizers/src/tokenizer/mod.rs
      Add borrowing `get_*` accessors on `Tokenizer`, one per component.
  * bindings/python/src/utils.rs
      Add `Container::from_ref`, which wraps a borrowed `Box<T>` as
      `Container::Pointer` without taking ownership of the box.
  * bindings/python/src/tokenizer.rs
      Add `#[getter]` methods that build the Python wrapper types from the
      two additions above.
  * bindings/python/src/models.rs
      Drop the `PathBuf` import.

NOTE(review): `Container::from_ref` casts a `*const T` derived from a shared
reference to `*mut T`, and nothing in this patch ties the stored pointer to
the lifetime of the tokenizer that owns the box. Confirm that the wrappers
returned by the new getters are never mutated through, and cannot outlive or
survive replacement of, the component they point at.

NOTE(review): the generic argument lists in this patch were reconstructed
after having been stripped. Those in the bindings follow from the method
bodies; the `dyn Trait + Sync` bounds and `Option<TruncationParams>` in
tokenizer/mod.rs are presumed from the crate's field types -- confirm against
the tree before applying.

diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs
index ac732324..5e40fc82 100644
--- a/bindings/python/src/models.rs
+++ b/bindings/python/src/models.rs
@@ -5,7 +5,7 @@ use super::utils::Container;
 use pyo3::exceptions;
 use pyo3::prelude::*;
 use pyo3::types::*;
-use std::path::{Path, PathBuf};
+use std::path::Path;
 
 /// A Model represents some tokenization algorithm like BPE or Word
 /// This class cannot be constructed directly. Please use one of the concrete models.
diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index d50f1c44..a5578619 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -12,6 +12,7 @@ use super::normalizers::Normalizer;
 use super::pre_tokenizers::PreTokenizer;
 use super::processors::PostProcessor;
 use super::trainers::Trainer;
+use super::utils::Container;
 
 use tk::tokenizer::{
     PaddingDirection, PaddingParams, PaddingStrategy, TruncationParams, TruncationStrategy,
@@ -301,4 +302,48 @@ impl Tokenizer {
             }
         })
     }
+
+    #[getter]
+    fn get_model(&self) -> PyResult<Model> {
+        Ok(Model {
+            model: Container::from_ref(self.tokenizer.get_model()),
+        })
+    }
+
+    #[getter]
+    fn get_normalizer(&self) -> PyResult<Option<Normalizer>> {
+        Ok(self
+            .tokenizer
+            .get_normalizer()
+            .map(|normalizer| Normalizer {
+                normalizer: Container::from_ref(normalizer),
+            }))
+    }
+
+    #[getter]
+    fn get_pre_tokenizer(&self) -> PyResult<Option<PreTokenizer>> {
+        Ok(self
+            .tokenizer
+            .get_pre_tokenizer()
+            .map(|pretok| PreTokenizer {
+                pretok: Container::from_ref(pretok),
+            }))
+    }
+
+    #[getter]
+    fn get_post_processor(&self) -> PyResult<Option<PostProcessor>> {
+        Ok(self
+            .tokenizer
+            .get_post_processor()
+            .map(|processor| PostProcessor {
+                processor: Container::from_ref(processor),
+            }))
+    }
+
+    #[getter]
+    fn get_decoder(&self) -> PyResult<Option<Decoder>> {
+        Ok(self.tokenizer.get_decoder().map(|decoder| Decoder {
+            decoder: Container::from_ref(decoder),
+        }))
+    }
 }
diff --git a/bindings/python/src/utils.rs b/bindings/python/src/utils.rs
index b360a191..02b2457c 100644
--- a/bindings/python/src/utils.rs
+++ b/bindings/python/src/utils.rs
@@ -17,6 +17,11 @@ impl<T> Container<T>
 where
     T: ?Sized,
 {
+    pub fn from_ref(reference: &Box<T>) -> Self {
+        let content: *const T = &**reference;
+        Container::Pointer(content as *mut _)
+    }
+
     /// Consumes ourself and return the Boxed element if we have the ownership, None otherwise.
     pub fn take(self) -> Option<Box<T>> {
         match self {
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 942935ad..86f92f6c 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -165,30 +165,60 @@ impl Tokenizer {
         self
     }
 
+    /// Get the normalizer
+    #[allow(clippy::borrowed_box)]
+    pub fn get_normalizer(&self) -> Option<&Box<dyn Normalizer + Sync>> {
+        self.normalizer.as_ref()
+    }
+
     /// Set the pre tokenizer
     pub fn with_pre_tokenizer(&mut self, pre_tokenizer: Box<dyn PreTokenizer + Sync>) -> &Self {
         self.pre_tokenizer = Some(pre_tokenizer);
         self
     }
 
+    /// Get the pre tokenizer
+    #[allow(clippy::borrowed_box)]
+    pub fn get_pre_tokenizer(&self) -> Option<&Box<dyn PreTokenizer + Sync>> {
+        self.pre_tokenizer.as_ref()
+    }
+
     /// Set the post processor
     pub fn with_post_processor(&mut self, post_processor: Box<dyn PostProcessor + Sync>) -> &Self {
         self.post_processor = Some(post_processor);
         self
     }
 
+    /// Get the post processor
+    #[allow(clippy::borrowed_box)]
+    pub fn get_post_processor(&self) -> Option<&Box<dyn PostProcessor + Sync>> {
+        self.post_processor.as_ref()
+    }
+
     /// Set the decoder
     pub fn with_decoder(&mut self, decoder: Box<dyn Decoder + Sync>) -> &Self {
         self.decoder = Some(decoder);
         self
     }
 
+    /// Get the decoder
+    #[allow(clippy::borrowed_box)]
+    pub fn get_decoder(&self) -> Option<&Box<dyn Decoder + Sync>> {
+        self.decoder.as_ref()
+    }
+
     /// Set the model
     pub fn with_model(&mut self, model: Box<dyn Model + Sync>) -> &Self {
         self.model = model;
         self
     }
 
+    /// Get the model
+    #[allow(clippy::borrowed_box)]
+    pub fn get_model(&self) -> &Box<dyn Model + Sync> {
+        &self.model
+    }
+
     /// Set the truncation parameters
     pub fn with_truncation(&mut self, trunc: Option<TruncationParams>) -> &Self {
         self.trunc = trunc;