Python - add BasicPreTokenizer

This commit is contained in:
Anthony MOI
2019-12-09 12:50:09 -05:00
parent d60d24a378
commit 3979096c52

View File

@ -4,6 +4,7 @@ use super::utils::Container;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use std::collections::HashSet;
#[pyclass]
pub struct PreTokenizer {
@ -36,6 +37,27 @@ impl ByteLevel {
}
}
/// Python-exposed entry point for the basic pre-tokenizer.
///
/// The struct has no fields of its own; its static `new` method returns a
/// `PreTokenizer` that wraps `tk::pre_tokenizers::basic::BasicPreTokenizer`.
#[pyclass]
pub struct BasicPreTokenizer {}
#[pymethods]
impl BasicPreTokenizer {
    /// Builds a `PreTokenizer` backed by `tk::pre_tokenizers::basic::BasicPreTokenizer`.
    ///
    /// The configuration is currently fixed: `do_lower_case` is `true`,
    /// `never_split` is an empty set and `tokenize_chinese_chars` is `true`.
    #[staticmethod]
    fn new() -> PyResult<PreTokenizer> {
        // TODO: Parse kwargs for these
        // Bound immutably: nothing reassigns these values yet, so declaring
        // them `mut` would only produce `unused_mut` compiler warnings.
        let do_lower_case = true;
        let never_split = HashSet::new();
        let tokenize_chinese_chars = true;

        Ok(PreTokenizer {
            pretok: Container::Owned(Box::new(tk::pre_tokenizers::basic::BasicPreTokenizer::new(
                do_lower_case,
                never_split,
                tokenize_chinese_chars,
            ))),
        })
    }
}
/// Attempt at providing Python the ability to give its own PreTokenizer
struct PyPreTokenizer {
class: PyObject,