mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
Python - Can add tokens
This commit is contained in:
@ -122,6 +122,31 @@ impl Tokenizer {
|
||||
self.tokenizer.id_to_token(id)
|
||||
}
|
||||
|
||||
fn add_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
|
||||
let tokens = tokens
|
||||
.into_iter()
|
||||
.map(|token| {
|
||||
if let Ok(content) = token.extract::<String>() {
|
||||
Ok(tk::tokenizer::AddedToken {
|
||||
content,
|
||||
..Default::default()
|
||||
})
|
||||
} else if let Ok((content, single_word)) = token.extract::<(String, bool)>() {
|
||||
Ok(tk::tokenizer::AddedToken {
|
||||
content,
|
||||
single_word,
|
||||
})
|
||||
} else {
|
||||
Err(exceptions::Exception::py_err(
|
||||
"Input must be a list[str] or list[(str, bool)]",
|
||||
))
|
||||
}
|
||||
})
|
||||
.collect::<PyResult<Vec<_>>>()?;
|
||||
|
||||
Ok(self.tokenizer.add_tokens(&tokens))
|
||||
}
|
||||
|
||||
fn train(&mut self, trainer: &Trainer, files: Vec<String>) -> PyResult<()> {
|
||||
trainer.trainer.execute(|trainer| {
|
||||
if let Err(e) = self.tokenizer.train(trainer, files) {
|
||||
|
Reference in New Issue
Block a user