Mirror of https://github.com/mii443/tokenizers.git
Python - Update bindings for BertPreTokenizer
@@ -34,7 +34,7 @@ fn models(_py: Python, m: &PyModule) -> PyResult<()> {
 fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<pre_tokenizers::PreTokenizer>()?;
     m.add_class::<pre_tokenizers::ByteLevel>()?;
-    m.add_class::<pre_tokenizers::BasicPreTokenizer>()?;
+    m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
     Ok(())
 }

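For context, `m.add_class::<T>()` is the PyO3 call that exposes a `#[pyclass]` type to Python, so the rename in this hunk is what makes `BertPreTokenizer` (rather than `BasicPreTokenizer`) importable from the extension module. A minimal sketch of that mechanism, with hypothetical `Demo`/`demo` names and assuming a PyO3 version contemporary with this commit:

    use pyo3::prelude::*;

    // Hypothetical class used for illustration only.
    #[pyclass]
    struct Demo {}

    // Registering the class makes it importable as `demo.Demo` from Python.
    #[pymodule]
    fn demo(_py: Python, m: &PyModule) -> PyResult<()> {
        m.add_class::<Demo>()?;
        Ok(())
    }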
@@ -39,12 +39,13 @@ impl ByteLevel {
 }
 
 #[pyclass]
-pub struct BasicPreTokenizer {}
+pub struct BertPreTokenizer {}
 #[pymethods]
-impl BasicPreTokenizer {
+impl BertPreTokenizer {
     #[staticmethod]
     #[args(kwargs = "**")]
     fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
+        let mut do_basic_tokenize = true;
         let mut do_lower_case = true;
         let mut never_split = HashSet::new();
         let mut tokenize_chinese_chars = true;
@@ -53,6 +54,7 @@ impl BasicPreTokenizer {
             for (key, val) in kwargs {
                 let key: &str = key.extract()?;
                 match key {
+                    "do_basic_tokenize" => do_basic_tokenize = val.extract()?,
                     "do_lower_case" => do_lower_case = val.extract()?,
                     "tokenize_chinese_chars" => tokenize_chinese_chars = val.extract()?,
                     "never_split" => {
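The constructor modified above follows a common PyO3 pattern: `#[args(kwargs = "**")]` collects Python keyword arguments into an `Option<&PyDict>`, defaults are declared up front, and each recognized key overwrites its default via `extract()`. A standalone sketch of that pattern, trimmed to two flags and using a hypothetical `parse_flags` helper:

    use pyo3::prelude::*;
    use pyo3::types::PyDict;

    // Sketch: fold optional Python kwargs into typed Rust defaults.
    fn parse_flags(kwargs: Option<&PyDict>) -> PyResult<(bool, bool)> {
        let mut do_basic_tokenize = true; // used when the kwarg is absent
        let mut do_lower_case = true;
        if let Some(kwargs) = kwargs {
            for (key, val) in kwargs {
                let key: &str = key.extract()?;
                match key {
                    "do_basic_tokenize" => do_basic_tokenize = val.extract()?,
                    "do_lower_case" => do_lower_case = val.extract()?,
                    _ => {} // unrecognized keys are silently ignored in this sketch
                }
            }
        }
        Ok((do_basic_tokenize, do_lower_case))
    }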
@@ -65,7 +67,8 @@ impl BasicPreTokenizer {
         }
 
         Ok(PreTokenizer {
-            pretok: Container::Owned(Box::new(tk::pre_tokenizers::basic::BasicPreTokenizer::new(
+            pretok: Container::Owned(Box::new(tk::pre_tokenizers::bert::BertPreTokenizer::new(
+                do_basic_tokenize,
                 do_lower_case,
                 never_split,
                 tokenize_chinese_chars,
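Net effect: the binding now forwards four flags instead of three, threading the new `do_basic_tokenize` option through to the core crate's `BertPreTokenizer::new`. A hedged sketch of the equivalent direct construction, assuming `tk` aliases the core tokenizers crate and that `never_split` is a `HashSet` of `String`s (the element type is not shown in this diff):

    use std::collections::HashSet;
    use tokenizers as tk;

    fn main() {
        // Same defaults the Python binding applies when no kwargs are given.
        let never_split: HashSet<String> = HashSet::new();
        let _pretok = tk::pre_tokenizers::bert::BertPreTokenizer::new(
            true,        // do_basic_tokenize
            true,        // do_lower_case
            never_split, // tokens that must never be split
            true,        // tokenize_chinese_chars
        );
    }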