Python - Update bindings for BertPreTokenizer

Author: Anthony MOI
Date:   2019-12-17 17:40:56 -05:00
Parent: e54eee7657
Commit: 0a3d4a86a9

2 changed files with 7 additions and 4 deletions

@@ -34,7 +34,7 @@ fn models(_py: Python, m: &PyModule) -> PyResult<()> {
 fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<pre_tokenizers::PreTokenizer>()?;
     m.add_class::<pre_tokenizers::ByteLevel>()?;
-    m.add_class::<pre_tokenizers::BasicPreTokenizer>()?;
+    m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
     Ok(())
 }
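For context: in pyo3, any #[pyclass] type registered on a module with add_class becomes importable from Python under its Rust type name, so this one-line rename is what changes the Python-facing class from BasicPreTokenizer to BertPreTokenizer. A minimal, self-contained sketch of that registration pattern, written against the older pyo3 module signature that appears in this diff (the module name demo and the empty struct are placeholders, not the real bindings):

use pyo3::prelude::*;

// Placeholder class; the real BertPreTokenizer wraps the core Rust tokenizer.
#[pyclass]
pub struct BertPreTokenizer {}

// Every class added here is exposed to Python under its Rust name, which is
// why renaming the type renames the Python-visible class.
#[pymodule]
fn demo(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<BertPreTokenizer>()?;
    Ok(())
}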

@@ -39,12 +39,13 @@ impl ByteLevel {
 }

 #[pyclass]
-pub struct BasicPreTokenizer {}
+pub struct BertPreTokenizer {}
 #[pymethods]
-impl BasicPreTokenizer {
+impl BertPreTokenizer {
     #[staticmethod]
     #[args(kwargs = "**")]
     fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
+        let mut do_basic_tokenize = true;
         let mut do_lower_case = true;
         let mut never_split = HashSet::new();
         let mut tokenize_chinese_chars = true;
@@ -53,6 +54,7 @@ impl BasicPreTokenizer {
             for (key, val) in kwargs {
                 let key: &str = key.extract()?;
                 match key {
+                    "do_basic_tokenize" => do_basic_tokenize = val.extract()?,
                     "do_lower_case" => do_lower_case = val.extract()?,
                     "tokenize_chinese_chars" => tokenize_chinese_chars = val.extract()?,
                     "never_split" => {
@@ -65,7 +67,8 @@ impl BasicPreTokenizer {
         }
         Ok(PreTokenizer {
-            pretok: Container::Owned(Box::new(tk::pre_tokenizers::basic::BasicPreTokenizer::new(
+            pretok: Container::Owned(Box::new(tk::pre_tokenizers::bert::BertPreTokenizer::new(
+                do_basic_tokenize,
                 do_lower_case,
                 never_split,
                 tokenize_chinese_chars,
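The constructor above follows a common kwargs pattern in these bindings: set defaults first, then let each recognized key overwrite its default through val.extract()?, which converts the Python value to the target Rust type and surfaces a Python TypeError on mismatch, before the values are forwarded to the core tk::pre_tokenizers::bert::BertPreTokenizer::new. A standalone sketch of that extraction loop, reduced to the three boolean flags (parse_options is a hypothetical helper, not part of the real bindings):

use pyo3::prelude::*;
use pyo3::types::PyDict;

// Mirrors the defaults-then-overwrite pattern from the diff: every flag has a
// default, and only keys present in **kwargs change it.
fn parse_options(kwargs: Option<&PyDict>) -> PyResult<(bool, bool, bool)> {
    let mut do_basic_tokenize = true;
    let mut do_lower_case = true;
    let mut tokenize_chinese_chars = true;
    if let Some(kwargs) = kwargs {
        for (key, val) in kwargs {
            let key: &str = key.extract()?;
            match key {
                "do_basic_tokenize" => do_basic_tokenize = val.extract()?,
                "do_lower_case" => do_lower_case = val.extract()?,
                "tokenize_chinese_chars" => tokenize_chinese_chars = val.extract()?,
                // Unknown keys are ignored in this sketch; the real binding
                // also handles "never_split", collecting strings into a HashSet.
                _ => {}
            }
        }
    }
    Ok((do_basic_tokenize, do_lower_case, tokenize_chinese_chars))
}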