Python - Update BpeTrainer interface

This commit is contained in:
Anthony MOI
2020-01-03 16:33:11 -05:00
parent 1dda76659f
commit e64b54b29e

View File

@ -1,5 +1,6 @@
extern crate tokenizers as tk; extern crate tokenizers as tk;
use super::error::ToPyResult;
use super::utils::Container; use super::utils::Container;
use pyo3::prelude::*; use pyo3::prelude::*;
use pyo3::types::*; use pyo3::types::*;
@ -20,32 +21,40 @@ impl BpeTrainer {
#[staticmethod] #[staticmethod]
#[args(kwargs = "**")] #[args(kwargs = "**")]
pub fn new(kwargs: Option<&PyDict>) -> PyResult<Trainer> { pub fn new(kwargs: Option<&PyDict>) -> PyResult<Trainer> {
let mut trainer = tk::models::bpe::BpeTrainer::default(); let mut builder = tk::models::bpe::BpeTrainer::builder();
if let Some(kwargs) = kwargs { if let Some(kwargs) = kwargs {
for (key, val) in kwargs { for (key, val) in kwargs {
let key: &str = key.extract()?; let key: &str = key.extract()?;
match key { match key {
"vocab_size" => trainer.vocab_size = val.extract()?, "vocab_size" => builder = builder.vocab_size(val.extract()?),
"min_frequency" => trainer.min_frequency = val.extract()?, "min_frequency" => builder = builder.min_frequency(val.extract()?),
"show_progress" => trainer.show_progress = val.extract()?, "show_progress" => builder = builder.show_progress(val.extract()?),
"special_tokens" => trainer.special_tokens = val.extract()?, "special_tokens" => builder = builder.special_tokens(val.extract()?),
"limit_alphabet" => trainer.limit_alphabet = val.extract()?, "limit_alphabet" => builder = builder.limit_alphabet(val.extract()?),
"initial_alphabet" => { "initial_alphabet" => {
let alphabet: Vec<String> = val.extract()?; let alphabet: Vec<String> = val.extract()?;
trainer.initial_alphabet = alphabet builder = builder.initial_alphabet(
alphabet
.into_iter() .into_iter()
.map(|s| s.chars().nth(0)) .map(|s| s.chars().nth(0))
.filter(|c| c.is_some()) .filter(|c| c.is_some())
.map(|c| c.unwrap()) .map(|c| c.unwrap())
.collect(); .collect(),
);
} }
"continuing_subword_prefix" => {
builder = builder.continuing_subword_prefix(val.extract()?)
}
"end_of_word_suffix" => builder = builder.end_of_word_suffix(val.extract()?),
_ => println!("Ignored unknown kwargs option {}", key), _ => println!("Ignored unknown kwargs option {}", key),
}; };
} }
} }
let trainer: PyResult<_> = ToPyResult(builder.build()).into();
Ok(Trainer { Ok(Trainer {
trainer: Container::Owned(Box::new(trainer)), trainer: Container::Owned(Box::new(trainer?)),
}) })
} }
} }