mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Python - add BasicPreTokenizer
This commit is contained in:
@ -4,6 +4,7 @@ use super::utils::Container;
|
||||
use pyo3::exceptions;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::*;
|
||||
use std::collections::HashSet;
|
||||
|
||||
#[pyclass]
|
||||
pub struct PreTokenizer {
|
||||
@ -36,6 +37,27 @@ impl ByteLevel {
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
pub struct BasicPreTokenizer {}
|
||||
#[pymethods]
|
||||
impl BasicPreTokenizer {
|
||||
#[staticmethod]
|
||||
fn new() -> PyResult<PreTokenizer> {
|
||||
// TODO: Parse kwargs for these
|
||||
let mut do_lower_case = true;
|
||||
let mut never_split = HashSet::new();
|
||||
let mut tokenize_chinese_chars = true;
|
||||
|
||||
Ok(PreTokenizer {
|
||||
pretok: Container::Owned(Box::new(tk::pre_tokenizers::basic::BasicPreTokenizer::new(
|
||||
do_lower_case,
|
||||
never_split,
|
||||
tokenize_chinese_chars,
|
||||
))),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempt at providing Python the ability to give its own PreTokenizer
|
||||
struct PyPreTokenizer {
|
||||
class: PyObject,
|
||||
|
Reference in New Issue
Block a user