Python - Handle kwargs for bert modules

@@ -61,8 +61,8 @@ elif args.type == "bert":
     print("Running Bert tokenizer")
     tok_p = BertTokenizer.from_pretrained('bert-base-uncased')
 
-    tok_r = Tokenizer(models.WordPiece.from_files(args.vocab))
-    tok_r.with_pre_tokenizer(pre_tokenizers.BasicPreTokenizer.new())
+    tok_r = Tokenizer(models.WordPiece.from_files(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100))
+    tok_r.with_pre_tokenizer(pre_tokenizers.BasicPreTokenizer.new(do_lower_case=True, tokenize_chinese_chars=True, never_split=[]))
     tok_r.with_decoder(decoders.WordPiece.new())
 else:
     raise Exception(f"Unknown type {args.type}")
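
The keyword arguments added in this hunk simply spell out the defaults that the Rust bindings fall back to (see the hunks below), so the benchmark behaves exactly as before; the point is that the options are now overridable from Python. A minimal sketch of that equivalence, assuming the import layout this benchmark script uses and a hypothetical vocab.txt path:

from tokenizers import Tokenizer, models

# Bare call: the binding falls back to its built-in defaults
# (unk_token="[UNK]", max_input_chars_per_word=100).
tok_a = Tokenizer(models.WordPiece.from_files("vocab.txt"))

# Same behavior, with the defaults spelled out explicitly.
tok_b = Tokenizer(models.WordPiece.from_files(
    "vocab.txt", unk_token="[UNK]", max_input_chars_per_word=100
))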

@@ -4,6 +4,7 @@ use super::utils::Container;
 
 use pyo3::exceptions;
 use pyo3::prelude::*;
+use pyo3::types::*;
 
 /// A Model represents some tokenization algorithm like BPE or Word
 /// This class cannot be constructed directly. Please use one of the concrete models.

@@ -71,10 +72,21 @@ impl WordPiece {
     ///
     /// Instantiate a new WordPiece model using the provided vocabulary file
     #[staticmethod]
-    fn from_files(vocab: &str) -> PyResult<Model> {
-        // TODO: Parse kwargs for these
-        let unk_token = String::from("[UNK]");
-        let max_input_chars_per_word = Some(100);
+    #[args(kwargs = "**")]
+    fn from_files(vocab: &str, kwargs: Option<&PyDict>) -> PyResult<Model> {
+        let mut unk_token = String::from("[UNK]");
+        let mut max_input_chars_per_word = Some(100);
+
+        if let Some(kwargs) = kwargs {
+            for (key, val) in kwargs {
+                let key: &str = key.extract()?;
+                match key {
+                    "unk_token" => unk_token = val.extract()?,
+                    "max_input_chars_per_word" => max_input_chars_per_word = Some(val.extract()?),
+                    _ => println!("Ignored unknown kwargs option {}", key),
+                }
+            }
+        }
 
         match tk::models::wordpiece::WordPiece::from_files(
             vocab,
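
Because the options arrive as a **kwargs dict rather than named parameters, a misspelled key does not raise a TypeError: it falls through to the catch-all match arm, which only prints a notice on stdout. A hedged sketch of that behavior from the Python side (hypothetical vocab path; the unknown key is deliberate):

from tokenizers import models

# Recognized key: overrides the "[UNK]" default.
wp = models.WordPiece.from_files("vocab.txt", unk_token="<unk>")

# Unrecognized key: the model is still built, but the binding prints
# "Ignored unknown kwargs option lowercase" instead of failing.
wp = models.WordPiece.from_files("vocab.txt", lowercase=True)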

@@ -43,12 +43,27 @@ pub struct BasicPreTokenizer {}
 #[pymethods]
 impl BasicPreTokenizer {
     #[staticmethod]
-    fn new() -> PyResult<PreTokenizer> {
-        // TODO: Parse kwargs for these
+    #[args(kwargs = "**")]
+    fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
         let mut do_lower_case = true;
         let mut never_split = HashSet::new();
         let mut tokenize_chinese_chars = true;
+
+        if let Some(kwargs) = kwargs {
+            for (key, val) in kwargs {
+                let key: &str = key.extract()?;
+                match key {
+                    "do_lower_case" => do_lower_case = val.extract()?,
+                    "tokenize_chinese_chars" => tokenize_chinese_chars = val.extract()?,
+                    "never_split" => {
+                        let values: Vec<String> = val.extract()?;
+                        never_split = values.into_iter().collect();
+                    }
+                    _ => println!("Ignored unknown kwargs option {}", key),
+                }
+            }
+        }
 
         Ok(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::basic::BasicPreTokenizer::new(
                 do_lower_case,
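
Here never_split is extracted as a Vec<String> and collected into a HashSet, so any Python list of strings is accepted and duplicates collapse into one entry; the two boolean flags mirror the options of BERT's BasicTokenizer. A sketch of the call surface this exposes, under the same assumptions as the examples above:

from tokenizers import pre_tokenizers

pretok = pre_tokenizers.BasicPreTokenizer.new(
    do_lower_case=False,                      # keep original casing
    tokenize_chinese_chars=True,              # isolate CJK characters
    never_split=["[CLS]", "[SEP]", "[SEP]"],  # deduplicated into a set on the Rust side
)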