mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-09 14:18:30 +00:00
Python - Fix BPE and WordPiece builders usage
This commit is contained in:
@@ -51,10 +51,7 @@ impl BPE {
|
|||||||
#[staticmethod]
|
#[staticmethod]
|
||||||
#[args(kwargs = "**")]
|
#[args(kwargs = "**")]
|
||||||
fn from_files(vocab: &str, merges: &str, kwargs: Option<&PyDict>) -> PyResult<Model> {
|
fn from_files(vocab: &str, merges: &str, kwargs: Option<&PyDict>) -> PyResult<Model> {
|
||||||
let builder: PyResult<_> =
|
let mut builder = tk::models::bpe::BPE::from_files(vocab, merges);
|
||||||
ToPyResult(tk::models::bpe::BPE::from_files(vocab, merges)).into();
|
|
||||||
let mut builder = builder?;
|
|
||||||
|
|
||||||
if let Some(kwargs) = kwargs {
|
if let Some(kwargs) = kwargs {
|
||||||
for (key, value) in kwargs {
|
for (key, value) in kwargs {
|
||||||
let key: &str = key.extract()?;
|
let key: &str = key.extract()?;
|
||||||
@@ -115,25 +112,27 @@ impl WordPiece {
|
|||||||
#[staticmethod]
|
#[staticmethod]
|
||||||
#[args(kwargs = "**")]
|
#[args(kwargs = "**")]
|
||||||
fn from_files(vocab: &str, kwargs: Option<&PyDict>) -> PyResult<Model> {
|
fn from_files(vocab: &str, kwargs: Option<&PyDict>) -> PyResult<Model> {
|
||||||
let mut unk_token = String::from("[UNK]");
|
let mut builder = tk::models::wordpiece::WordPiece::from_files(vocab);
|
||||||
let mut max_input_chars_per_word = Some(100);
|
|
||||||
|
|
||||||
if let Some(kwargs) = kwargs {
|
if let Some(kwargs) = kwargs {
|
||||||
for (key, val) in kwargs {
|
for (key, val) in kwargs {
|
||||||
let key: &str = key.extract()?;
|
let key: &str = key.extract()?;
|
||||||
match key {
|
match key {
|
||||||
"unk_token" => unk_token = val.extract()?,
|
"unk_token" => {
|
||||||
"max_input_chars_per_word" => max_input_chars_per_word = Some(val.extract()?),
|
builder = builder.unk_token(val.extract()?);
|
||||||
|
}
|
||||||
|
"max_input_chars_per_word" => {
|
||||||
|
builder = builder.max_input_chars_per_word(val.extract()?);
|
||||||
|
}
|
||||||
|
"continuing_subword_prefix" => {
|
||||||
|
builder = builder.continuing_subword_prefix(val.extract()?);
|
||||||
|
}
|
||||||
_ => println!("Ignored unknown kwargs option {}", key),
|
_ => println!("Ignored unknown kwargs option {}", key),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
match tk::models::wordpiece::WordPiece::from_files(
|
match builder.build() {
|
||||||
vocab,
|
|
||||||
unk_token,
|
|
||||||
max_input_chars_per_word,
|
|
||||||
) {
|
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
println!("Errors: {:?}", e);
|
println!("Errors: {:?}", e);
|
||||||
Err(exceptions::Exception::py_err(
|
Err(exceptions::Exception::py_err(
|
||||||
|
|||||||
Reference in New Issue
Block a user