mirror of
https://github.com/mii443/tokenizers.git
synced 2025-09-04 00:09:34 +00:00
Python - Update BpeTrainer interface
This commit is contained in:
@ -1,5 +1,6 @@
|
|||||||
extern crate tokenizers as tk;
|
extern crate tokenizers as tk;
|
||||||
|
|
||||||
|
use super::error::ToPyResult;
|
||||||
use super::utils::Container;
|
use super::utils::Container;
|
||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
use pyo3::types::*;
|
use pyo3::types::*;
|
||||||
@ -20,32 +21,40 @@ impl BpeTrainer {
|
|||||||
#[staticmethod]
|
#[staticmethod]
|
||||||
#[args(kwargs = "**")]
|
#[args(kwargs = "**")]
|
||||||
pub fn new(kwargs: Option<&PyDict>) -> PyResult<Trainer> {
|
pub fn new(kwargs: Option<&PyDict>) -> PyResult<Trainer> {
|
||||||
let mut trainer = tk::models::bpe::BpeTrainer::default();
|
let mut builder = tk::models::bpe::BpeTrainer::builder();
|
||||||
if let Some(kwargs) = kwargs {
|
if let Some(kwargs) = kwargs {
|
||||||
for (key, val) in kwargs {
|
for (key, val) in kwargs {
|
||||||
let key: &str = key.extract()?;
|
let key: &str = key.extract()?;
|
||||||
match key {
|
match key {
|
||||||
"vocab_size" => trainer.vocab_size = val.extract()?,
|
"vocab_size" => builder = builder.vocab_size(val.extract()?),
|
||||||
"min_frequency" => trainer.min_frequency = val.extract()?,
|
"min_frequency" => builder = builder.min_frequency(val.extract()?),
|
||||||
"show_progress" => trainer.show_progress = val.extract()?,
|
"show_progress" => builder = builder.show_progress(val.extract()?),
|
||||||
"special_tokens" => trainer.special_tokens = val.extract()?,
|
"special_tokens" => builder = builder.special_tokens(val.extract()?),
|
||||||
"limit_alphabet" => trainer.limit_alphabet = val.extract()?,
|
"limit_alphabet" => builder = builder.limit_alphabet(val.extract()?),
|
||||||
"initial_alphabet" => {
|
"initial_alphabet" => {
|
||||||
let alphabet: Vec<String> = val.extract()?;
|
let alphabet: Vec<String> = val.extract()?;
|
||||||
trainer.initial_alphabet = alphabet
|
builder = builder.initial_alphabet(
|
||||||
|
alphabet
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|s| s.chars().nth(0))
|
.map(|s| s.chars().nth(0))
|
||||||
.filter(|c| c.is_some())
|
.filter(|c| c.is_some())
|
||||||
.map(|c| c.unwrap())
|
.map(|c| c.unwrap())
|
||||||
.collect();
|
.collect(),
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
"continuing_subword_prefix" => {
|
||||||
|
builder = builder.continuing_subword_prefix(val.extract()?)
|
||||||
|
}
|
||||||
|
"end_of_word_suffix" => builder = builder.end_of_word_suffix(val.extract()?),
|
||||||
_ => println!("Ignored unknown kwargs option {}", key),
|
_ => println!("Ignored unknown kwargs option {}", key),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let trainer: PyResult<_> = ToPyResult(builder.build()).into();
|
||||||
|
|
||||||
Ok(Trainer {
|
Ok(Trainer {
|
||||||
trainer: Container::Owned(Box::new(trainer)),
|
trainer: Container::Owned(Box::new(trainer?)),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user