mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-06 12:48:18 +00:00
Expose Tokenizer parts
This commit is contained in:
@@ -5,7 +5,7 @@ use super::utils::Container;
|
|||||||
use pyo3::exceptions;
|
use pyo3::exceptions;
|
||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
use pyo3::types::*;
|
use pyo3::types::*;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::Path;
|
||||||
|
|
||||||
/// A Model represents some tokenization algorithm like BPE or Word
|
/// A Model represents some tokenization algorithm like BPE or Word
|
||||||
/// This class cannot be constructed directly. Please use one of the concrete models.
|
/// This class cannot be constructed directly. Please use one of the concrete models.
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ use super::normalizers::Normalizer;
|
|||||||
use super::pre_tokenizers::PreTokenizer;
|
use super::pre_tokenizers::PreTokenizer;
|
||||||
use super::processors::PostProcessor;
|
use super::processors::PostProcessor;
|
||||||
use super::trainers::Trainer;
|
use super::trainers::Trainer;
|
||||||
|
use super::utils::Container;
|
||||||
|
|
||||||
use tk::tokenizer::{
|
use tk::tokenizer::{
|
||||||
PaddingDirection, PaddingParams, PaddingStrategy, TruncationParams, TruncationStrategy,
|
PaddingDirection, PaddingParams, PaddingStrategy, TruncationParams, TruncationStrategy,
|
||||||
@@ -301,4 +302,48 @@ impl Tokenizer {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[getter]
|
||||||
|
fn get_model(&self) -> PyResult<Model> {
|
||||||
|
Ok(Model {
|
||||||
|
model: Container::from_ref(self.tokenizer.get_model()),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[getter]
|
||||||
|
fn get_normalizer(&self) -> PyResult<Option<Normalizer>> {
|
||||||
|
Ok(self
|
||||||
|
.tokenizer
|
||||||
|
.get_normalizer()
|
||||||
|
.map(|normalizer| Normalizer {
|
||||||
|
normalizer: Container::from_ref(normalizer),
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[getter]
|
||||||
|
fn get_pre_tokenizer(&self) -> PyResult<Option<PreTokenizer>> {
|
||||||
|
Ok(self
|
||||||
|
.tokenizer
|
||||||
|
.get_pre_tokenizer()
|
||||||
|
.map(|pretok| PreTokenizer {
|
||||||
|
pretok: Container::from_ref(pretok),
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[getter]
|
||||||
|
fn get_post_processor(&self) -> PyResult<Option<PostProcessor>> {
|
||||||
|
Ok(self
|
||||||
|
.tokenizer
|
||||||
|
.get_post_processor()
|
||||||
|
.map(|processor| PostProcessor {
|
||||||
|
processor: Container::from_ref(processor),
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[getter]
|
||||||
|
fn get_decoder(&self) -> PyResult<Option<Decoder>> {
|
||||||
|
Ok(self.tokenizer.get_decoder().map(|decoder| Decoder {
|
||||||
|
decoder: Container::from_ref(decoder),
|
||||||
|
}))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,6 +17,11 @@ impl<T> Container<T>
|
|||||||
where
|
where
|
||||||
T: ?Sized,
|
T: ?Sized,
|
||||||
{
|
{
|
||||||
|
pub fn from_ref(reference: &Box<T>) -> Self {
|
||||||
|
let content: *const T = &**reference;
|
||||||
|
Container::Pointer(content as *mut _)
|
||||||
|
}
|
||||||
|
|
||||||
/// Consumes ourself and return the Boxed element if we have the ownership, None otherwise.
|
/// Consumes ourself and return the Boxed element if we have the ownership, None otherwise.
|
||||||
pub fn take(self) -> Option<Box<T>> {
|
pub fn take(self) -> Option<Box<T>> {
|
||||||
match self {
|
match self {
|
||||||
|
|||||||
@@ -165,30 +165,60 @@ impl Tokenizer {
|
|||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get the normalizer
|
||||||
|
#[allow(clippy::borrowed_box)]
|
||||||
|
pub fn get_normalizer(&self) -> Option<&Box<dyn Normalizer + Sync>> {
|
||||||
|
self.normalizer.as_ref()
|
||||||
|
}
|
||||||
|
|
||||||
/// Set the pre tokenizer
|
/// Set the pre tokenizer
|
||||||
pub fn with_pre_tokenizer(&mut self, pre_tokenizer: Box<dyn PreTokenizer + Sync>) -> &Self {
|
pub fn with_pre_tokenizer(&mut self, pre_tokenizer: Box<dyn PreTokenizer + Sync>) -> &Self {
|
||||||
self.pre_tokenizer = Some(pre_tokenizer);
|
self.pre_tokenizer = Some(pre_tokenizer);
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get the pre tokenizer
|
||||||
|
#[allow(clippy::borrowed_box)]
|
||||||
|
pub fn get_pre_tokenizer(&self) -> Option<&Box<dyn PreTokenizer + Sync>> {
|
||||||
|
self.pre_tokenizer.as_ref()
|
||||||
|
}
|
||||||
|
|
||||||
/// Set the post processor
|
/// Set the post processor
|
||||||
pub fn with_post_processor(&mut self, post_processor: Box<dyn PostProcessor + Sync>) -> &Self {
|
pub fn with_post_processor(&mut self, post_processor: Box<dyn PostProcessor + Sync>) -> &Self {
|
||||||
self.post_processor = Some(post_processor);
|
self.post_processor = Some(post_processor);
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get the post processor
|
||||||
|
#[allow(clippy::borrowed_box)]
|
||||||
|
pub fn get_post_processor(&self) -> Option<&Box<dyn PostProcessor + Sync>> {
|
||||||
|
self.post_processor.as_ref()
|
||||||
|
}
|
||||||
|
|
||||||
/// Set the decoder
|
/// Set the decoder
|
||||||
pub fn with_decoder(&mut self, decoder: Box<dyn Decoder + Sync>) -> &Self {
|
pub fn with_decoder(&mut self, decoder: Box<dyn Decoder + Sync>) -> &Self {
|
||||||
self.decoder = Some(decoder);
|
self.decoder = Some(decoder);
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get the decoder
|
||||||
|
#[allow(clippy::borrowed_box)]
|
||||||
|
pub fn get_decoder(&self) -> Option<&Box<dyn Decoder + Sync>> {
|
||||||
|
self.decoder.as_ref()
|
||||||
|
}
|
||||||
|
|
||||||
/// Set the model
|
/// Set the model
|
||||||
pub fn with_model(&mut self, model: Box<dyn Model + Sync>) -> &Self {
|
pub fn with_model(&mut self, model: Box<dyn Model + Sync>) -> &Self {
|
||||||
self.model = model;
|
self.model = model;
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get the model
|
||||||
|
#[allow(clippy::borrowed_box)]
|
||||||
|
pub fn get_model(&self) -> &Box<dyn Model + Sync> {
|
||||||
|
&self.model
|
||||||
|
}
|
||||||
|
|
||||||
/// Set the truncation parameters
|
/// Set the truncation parameters
|
||||||
pub fn with_truncation(&mut self, trunc: Option<TruncationParams>) -> &Self {
|
pub fn with_truncation(&mut self, trunc: Option<TruncationParams>) -> &Self {
|
||||||
self.trunc = trunc;
|
self.trunc = trunc;
|
||||||
|
|||||||
Reference in New Issue
Block a user