mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
Python - Add WordPieceTrainer
This commit is contained in:
@ -18,6 +18,7 @@ use pyo3::wrap_pymodule;
|
||||
fn trainers(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<trainers::Trainer>()?;
|
||||
m.add_class::<trainers::BpeTrainer>()?;
|
||||
m.add_class::<trainers::WordPieceTrainer>()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -14,7 +14,7 @@ pub struct Trainer {
|
||||
pub struct BpeTrainer {}
|
||||
#[pymethods]
|
||||
impl BpeTrainer {
|
||||
/// new(/vocab_size, min_frequency)
|
||||
/// new(/ vocab_size, min_frequency)
|
||||
/// --
|
||||
///
|
||||
/// Create a new BpeTrainer with the given configuration
|
||||
@ -58,3 +58,52 @@ impl BpeTrainer {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
pub struct WordPieceTrainer {}
|
||||
#[pymethods]
|
||||
impl WordPieceTrainer {
|
||||
/// new(/ vocab_size, min_frequency)
|
||||
/// --
|
||||
///
|
||||
/// Create a new BpeTrainer with the given configuration
|
||||
#[staticmethod]
|
||||
#[args(kwargs = "**")]
|
||||
pub fn new(kwargs: Option<&PyDict>) -> PyResult<Trainer> {
|
||||
let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
|
||||
if let Some(kwargs) = kwargs {
|
||||
for (key, val) in kwargs {
|
||||
let key: &str = key.extract()?;
|
||||
match key {
|
||||
"vocab_size" => builder = builder.vocab_size(val.extract()?),
|
||||
"min_frequency" => builder = builder.min_frequency(val.extract()?),
|
||||
"show_progress" => builder = builder.show_progress(val.extract()?),
|
||||
"special_tokens" => builder = builder.special_tokens(val.extract()?),
|
||||
"limit_alphabet" => builder = builder.limit_alphabet(val.extract()?),
|
||||
"initial_alphabet" => {
|
||||
let alphabet: Vec<String> = val.extract()?;
|
||||
builder = builder.initial_alphabet(
|
||||
alphabet
|
||||
.into_iter()
|
||||
.map(|s| s.chars().nth(0))
|
||||
.filter(|c| c.is_some())
|
||||
.map(|c| c.unwrap())
|
||||
.collect(),
|
||||
);
|
||||
}
|
||||
"continuing_subword_prefix" => {
|
||||
builder = builder.continuing_subword_prefix(val.extract()?)
|
||||
}
|
||||
"end_of_word_suffix" => builder = builder.end_of_word_suffix(val.extract()?),
|
||||
_ => println!("Ignored unknown kwargs option {}", key),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
let trainer: PyResult<_> = ToPyResult(builder.build()).into();
|
||||
|
||||
Ok(Trainer {
|
||||
trainer: Container::Owned(Box::new(trainer?)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user